From e4fe55362ded353ef6b88f96fa021095bd03c627 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 16 Sep 2025 10:41:56 +0800 Subject: [PATCH 01/13] add h800 --- .github/workflows/benchmark.yml | 4 +- .github/workflows/daily_ete_test.yml | 4 +- .github/workflows/daily_ete_test_3090.yml | 8 +- .github/workflows/daily_ete_test_5080.yml | 18 +- .github/workflows/daily_ete_test_h800.yml | 388 ++++++++++++++++++++++ autotest/config-h800.yaml | 118 +++++++ 6 files changed, 523 insertions(+), 17 deletions(-) create mode 100644 .github/workflows/daily_ete_test_h800.yml create mode 100644 autotest/config-h800.yaml diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 0ee4db9a28..62d2e19ca5 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -27,7 +27,7 @@ on: env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai - OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} + OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }} REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true FAIL_CONFIG: ${{ github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} @@ -42,7 +42,7 @@ jobs: env: PYTHON_VERSION: ${{ matrix.pyver }} PLAT_NAME: manylinux2014_x86_64 - DOCKER_TAG: cuda11.8 + DOCKER_TAG: cuda12.4 steps: - name: Checkout repository uses: actions/checkout@v3 diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index f10bc2993b..8a90df4309 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -44,7 +44,7 @@ on: env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai - OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} + OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} COV_PARAM: 
--cov /opt/py3/lib/python3.10/site-packages/lmdeploy @@ -64,7 +64,7 @@ jobs: env: PYTHON_VERSION: ${{ matrix.pyver }} PLAT_NAME: manylinux2014_x86_64 - DOCKER_TAG: cuda11.8 + DOCKER_TAG: cuda12.4 steps: - name: Checkout repository uses: actions/checkout@v3 diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 21d200a405..6c0a0942c5 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -153,7 +153,7 @@ jobs: python3 -m pip install -r requirements/lite.txt - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install transformers==4.53.1 datasets==3.6.0 timm python3 -m pip install -r requirements/test.txt - name: Check env @@ -226,7 +226,7 @@ jobs: python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | @@ -290,7 +290,7 @@ jobs: python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | @@ -370,7 +370,7 @@ jobs: run: cp -r ${{env.TEST_CODE_PATH}}/. . 
- name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Get coverage report run: | diff --git a/.github/workflows/daily_ete_test_5080.yml b/.github/workflows/daily_ete_test_5080.yml index 7d9b250385..bd12b0beff 100644 --- a/.github/workflows/daily_ete_test_5080.yml +++ b/.github/workflows/daily_ete_test_5080.yml @@ -92,7 +92,7 @@ jobs: download_pkgs: needs: linux-build if: ${{!cancelled()}} - runs-on: [self-hosted, 5090-r1] + runs-on: [self-hosted, 5080-r1] timeout-minutes: 50 container: image: openmmlab/lmdeploy:latest-cu12.8 @@ -129,7 +129,7 @@ jobs: test_quantization: needs: download_pkgs if: ${{!cancelled() && contains(needs.download_pkgs.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} - runs-on: [self-hosted, 5090-r1] + runs-on: [self-hosted, 5080-r1] timeout-minutes: 150 env: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA @@ -153,7 +153,7 @@ jobs: python3 -m pip install -r requirements/lite.txt - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install transformers==4.53.1 datasets==3.6.0 timm python3 -m pip install -r requirements/test.txt - name: Check env @@ -188,7 +188,7 @@ jobs: chmod -R 777 $workdir test_tools: if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} - runs-on: [self-hosted, 5090-r1] + runs-on: [self-hosted, 5080-r1] needs: test_quantization timeout-minutes: 300 strategy: @@ -225,7 +225,7 @@ jobs: python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps 
+ python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | @@ -265,7 +265,7 @@ jobs: chmod -R 777 $workdir test_restful: if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} - runs-on: [self-hosted, 5090-r1] + runs-on: [self-hosted, 5080-r1] needs: test_quantization strategy: fail-fast: false @@ -289,7 +289,7 @@ jobs: python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | @@ -353,7 +353,7 @@ jobs: chmod -R 777 $workdir get_coverage_report: if: ${{!cancelled() && success()}} - runs-on: [self-hosted, 5090-r1] + runs-on: [self-hosted, 5080-r1] needs: [test_tools, test_restful] timeout-minutes: 5 container: @@ -368,7 +368,7 @@ jobs: run: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Get coverage report run: | diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml new file mode 100644 index 0000000000..dfcbe2d718 --- /dev/null +++ b/.github/workflows/daily_ete_test_h800.yml @@ -0,0 +1,388 @@ +name: daily_ete_test_h800 + +on: + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. 
Default is "main"' + type: string + default: 'main' + backend: + required: true + description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch']" + model: + required: true + description: 'Set testcase module filter: llm, mllm. Default contains all models' + type: string + default: "['llm','mllm']" + function: + required: true + description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions' + type: string + default: '["pipeline", "restful", "chat"]' + offline_mode: + required: true + description: 'Whether to start an offline mode, if true, you should prepare code and whl package by yourself' + type: boolean + default: false + regression_func: + required: true + description: 'regression functions' + type: string + default: "['quant', 'tools','restful','pipeline']" + schedule: + - cron: '00 14 * * 0-4' + +env: + HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache + HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai + OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} + COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} + TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ github.run_id }} + OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy + OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt + +jobs: + linux-build: + if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.8 + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + repository: 
${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + + download_pkgs: + needs: linux-build + if: ${{!cancelled()}} + runs-on: [self-hosted, h800-r1] + timeout-minutes: 50 + container: + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme2/share:/nvme2/share + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-h800.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-h800.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + + test_quantization: + needs: download_pkgs + if: ${{!cancelled() && contains(needs.download_pkgs.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} + runs-on: [self-hosted, h800-r1] + timeout-minutes: 150 + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub + MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + container: + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme2/share:/nvme2/share + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy - dependency + run: | + python3 -m pip install auto_gptq matplotlib attrdict + python3 -m pip install -r requirements/lite.txt + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install transformers==4.53.1 datasets==3.6.0 timm + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - quantization w4a16 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - quantization w8a8 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'pytorch') + run: | + pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. 
+ rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_tools: + if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + runs-on: [self-hosted, h800-r1] + needs: test_quantization + timeout-minutes: 300 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} + function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} + exclude: + - backend: turbomind + model: mllm + function: chat + - backend: pytorch + model: mllm + function: chat + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub + MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + container: + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme2/share:/nvme2/share + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - chat + continue-on-error: true + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - pipeline + continue-on-error: true + if: matrix.function == 'pipeline' + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and test_3090' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage 
${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - restful + continue-on-error: true + if: matrix.function == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: 
always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_restful: + if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + runs-on: [self-hosted, h800-r1] + needs: test_quantization + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model: ['Intern-S1', 'internlm2_5-20b-chat', 'internlm2_5-20b'] + include: + - tp: 8 + model: Intern-S1 + timeout-minutes: 60 + container: + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme2/share:/nvme2/share + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Start restful api + if: matrix.model != 'internlm2_5-20b' + run: | + lmdeploy serve api_server /nvme/qa_test_models/internlm/${{matrix.model}} --tp ${{matrix.tp}} --backend ${{matrix.backend}} > ${{env.REPORT_DIR}}/${{matrix.backend}}_${{matrix.model}}_start_chat_restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Test lmdeploy - restful api + if: matrix.model == 'Intern-S1' + timeout-minutes: 30 + run: | + pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}} and not internlm2_5' --alluredir=${{env.REPORT_DIR}}/interface-${{matrix.model}}-${{matrix.backend}}_ ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: matrix.model != 'internlm2_5-20b' + run: | + kill -15 "$restful_pid" + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + chmod -R 777 /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + export workdir=$(pwd) + cd .. 
+ rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + get_coverage_report: + if: ${{!cancelled() && success()}} + runs-on: [self-hosted, h800-r1] + needs: [test_tools, test_restful] + timeout-minutes: 5 + container: + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Get coverage report + run: | + pip install coverage + coverage combine ${{env.REPORT_DIR}} + coverage xml -o ${{env.REPORT_DIR}}/coverage.xml + coverage report -m + mv .coverage ${{env.REPORT_DIR}}/.coverage + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. 
+ rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml new file mode 100644 index 0000000000..11c771f5a4 --- /dev/null +++ b/autotest/config-h800.yaml @@ -0,0 +1,118 @@ +model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource +dst_path: /nvme/qa_test_models/autotest_model +log_path: /nvme/qa_test_models/autotest_model/log +benchmark_path: /nvme/qa_test_models/benchmark-reports +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +env_tag: 5080 + +tp_config: + Intern-S1: 8 + Qwen3-235B-A22B: 8 + Qwen3-235B-A22B-FP8: 4 + Qwen3-30B-A3B: 2 + Qwen3-32B: 2 + gpt-oss-120b: 2 + + +turbomind_chat_model: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - Qwen/Qwen3-0.6B-FP8 + - Qwen/Qwen3-1.7B-FP8 + - Qwen/Qwen3-4B-FP8 + - Qwen/Qwen3-8B-FP8 + - Qwen/Qwen3-14B-FP8 + - Qwen/Qwen3-235B-A22B + - Qwen/Qwen3-235B-A22B-FP8 + - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-32B + - Qwen/Qwen3-32B-FP8 + - openai/gpt-oss-120b + - openai/gpt-oss-20b + +pytorch_chat_model: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - Qwen/Qwen3-0.6B-FP8 + - Qwen/Qwen3-1.7B-FP8 + - Qwen/Qwen3-4B-FP8 + - Qwen/Qwen3-8B-FP8 + - Qwen/Qwen3-14B-FP8 + - Qwen/Qwen3-235B-A22B + - Qwen/Qwen3-235B-A22B-FP8 + - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-32B + - Qwen/Qwen3-32B-FP8 + - openai/gpt-oss-120b + - openai/gpt-oss-20b + +turbomind_vl_model: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + +pytorch_vl_model: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + +turbomind_base_model: + - internlm/Intern-S1-mini + - Qwen/Qwen3-4B-FP8 + - openai/gpt-oss-20b + +pytorch_base_model: + - internlm/Intern-S1-mini + - Qwen/Qwen3-4B-FP8 + - openai/gpt-oss-20b + +turbomind_quatization: + no_awq: + - empty + gptq: + - empty + no_kvint4: + - Qwen/Qwen3-0.6B-FP8 + - Qwen/Qwen3-1.7B-FP8 + - Qwen/Qwen3-4B-FP8 + - Qwen/Qwen3-8B-FP8 + - Qwen/Qwen3-14B-FP8 + - Qwen/Qwen3-235B-A22B + - 
Qwen/Qwen3-235B-A22B-FP8 + - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-32B + - Qwen/Qwen3-32B-FP8 + no_kvint8: + - empty + +pytorch_quatization: + awq: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - Qwen/Qwen3-0.6B-FP8 + - Qwen/Qwen3-1.7B-FP8 + - Qwen/Qwen3-4B-FP8 + - Qwen/Qwen3-8B-FP8 + - Qwen/Qwen3-14B-FP8 + - Qwen/Qwen3-235B-A22B + - Qwen/Qwen3-235B-A22B-FP8 + - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-32B + - Qwen/Qwen3-32B-FP8 + - openai/gpt-oss-120b + - openai/gpt-oss-20b + w8a8: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + no_kvint4: + - Qwen/Qwen3-0.6B-FP8 + - Qwen/Qwen3-1.7B-FP8 + - Qwen/Qwen3-4B-FP8 + - Qwen/Qwen3-8B-FP8 + - Qwen/Qwen3-14B-FP8 + - Qwen/Qwen3-235B-A22B + - Qwen/Qwen3-235B-A22B-FP8 + - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-32B + - Qwen/Qwen3-32B-FP8 + no_kvint8: + - empty From 96be64daee2ef4df834703754949954c111bf610 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 16 Sep 2025 10:49:34 +0800 Subject: [PATCH 02/13] update --- autotest/config-h800.yaml | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml index 11c771f5a4..7a47c5e8b7 100644 --- a/autotest/config-h800.yaml +++ b/autotest/config-h800.yaml @@ -1,10 +1,10 @@ -model_path: /nvme/qa_test_models -resource_path: /nvme/qa_test_models/resource -dst_path: /nvme/qa_test_models/autotest_model -log_path: /nvme/qa_test_models/autotest_model/log -benchmark_path: /nvme/qa_test_models/benchmark-reports -dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json -env_tag: 5080 +model_path: /nvme1/qa_test_models +resource_path: /nvme1/qa_test_models/resource +dst_path: /nvme1/qa_test_models/autotest_model +log_path: /nvme1/qa_test_models/autotest_model/log +benchmark_path: /nvme1/qa_test_models/benchmark-reports +dataset_path: /nvme1/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +env_tag: h800 tp_config: Intern-S1: 8 @@ -13,7 +13,8 @@ tp_config: Qwen3-30B-A3B: 2 
Qwen3-32B: 2 gpt-oss-120b: 2 - + gpt-oss-120b-BF16: 4 + gpt-oss-20b-BF16: 2 turbomind_chat_model: - internlm/Intern-S1 @@ -44,8 +45,8 @@ pytorch_chat_model: - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 - - openai/gpt-oss-120b - - openai/gpt-oss-20b + - unsloth/gpt-oss-120b-BF16 + - unsloth/gpt-oss-20b-BF16 turbomind_vl_model: - internlm/Intern-S1 From a1e5bf747754adba602ffce5d8cbe2a1c8df9386 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 16 Sep 2025 11:02:32 +0800 Subject: [PATCH 03/13] update --- .github/workflows/daily_ete_test_h800.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index dfcbe2d718..2660d3844e 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -100,6 +100,7 @@ jobs: volumes: - /nvme/qa_test_models:/nvme/qa_test_models - /nvme2/share:/nvme2/share + - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Clone repository @@ -142,6 +143,7 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/qa_test_models:/nvme/qa_test_models - /nvme2/share:/nvme2/share + - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Copy repository and Artifacts @@ -216,6 +218,7 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/qa_test_models:/nvme/qa_test_models - /nvme2/share:/nvme2/share + - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Copy repository and Artifacts @@ -303,6 +306,7 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/qa_test_models:/nvme/qa_test_models - /nvme2/share:/nvme2/share + - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Copy repository and Artifacts From 6c70f277b3a6d2afd0c647d6a49ccf071f6de336 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 
16 Sep 2025 11:11:56 +0800 Subject: [PATCH 04/13] update --- .github/workflows/daily_ete_test_h800.yml | 5 +++++ autotest/config-h800.yaml | 12 ++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index 2660d3844e..b58261108f 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -99,6 +99,7 @@ jobs: options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme1/qa_test_models:/nvme1/qa_test_models - /nvme2/share:/nvme2/share - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro @@ -142,6 +143,7 @@ jobs: volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme1/qa_test_models:/nvme1/qa_test_models - /nvme2/share:/nvme2/share - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro @@ -217,6 +219,7 @@ jobs: volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme1/qa_test_models:/nvme1/qa_test_models - /nvme2/share:/nvme2/share - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro @@ -305,6 +308,7 @@ jobs: volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme1/qa_test_models:/nvme1/qa_test_models - /nvme2/share:/nvme2/share - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro @@ -366,6 +370,7 @@ jobs: volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme1/qa_test_models:/nvme1/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Copy repository and Artifacts diff --git a/autotest/config-h800.yaml 
b/autotest/config-h800.yaml index 7a47c5e8b7..275a3fdba9 100644 --- a/autotest/config-h800.yaml +++ b/autotest/config-h800.yaml @@ -1,9 +1,9 @@ -model_path: /nvme1/qa_test_models -resource_path: /nvme1/qa_test_models/resource -dst_path: /nvme1/qa_test_models/autotest_model -log_path: /nvme1/qa_test_models/autotest_model/log -benchmark_path: /nvme1/qa_test_models/benchmark-reports -dataset_path: /nvme1/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource +dst_path: /nvme/qa_test_models/autotest_model +log_path: /nvme/qa_test_models/autotest_model/log +benchmark_path: /nvme/qa_test_models/benchmark-reports +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json env_tag: h800 tp_config: From 1c4c42ae82d945720c1deb389c3f5204cdc1fded Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 16 Sep 2025 11:23:45 +0800 Subject: [PATCH 05/13] update mark --- .github/workflows/daily_ete_test_h800.yml | 24 +++++++++---------- .../chat/test_command_chat_hf_pytorch.py | 3 +++ .../chat/test_command_chat_hf_turbomind.py | 5 ++++ .../test_pipeline_chat_pytorch_llm.py | 3 +++ .../test_pipeline_chat_turbomind_llm.py | 5 ++++ .../test_pipeline_chat_turbomind_mllm.py | 2 ++ .../test_restful_chat_hf_pytorch_llm.py | 8 +++++++ .../test_restful_chat_hf_turbomind_llm.py | 8 +++++++ 8 files changed, 46 insertions(+), 12 deletions(-) diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index b58261108f..1c2669d27e 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -247,37 +247,37 @@ jobs: continue-on-error: true if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' run: | - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} 
${{env.COV_PARAM}} || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test and not other' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true if: matrix.function == 'pipeline' run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and test_3090' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} 
${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true if: matrix.function == 'restful' run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and not other' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage 
${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py index 3c13cb1ebf..9f84eb9d8f 100644 --- a/autotest/tools/chat/test_command_chat_hf_pytorch.py +++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py @@ -296,6 +296,7 @@ def test_hf_pytorch_chat_pr(config, model, cli_case_config): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct']) 
def test_modelscope_pytorch_chat_tp1(config, model, cli_case_config, worker_id): os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True' @@ -319,6 +320,7 @@ def test_modelscope_pytorch_chat_tp1(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', ['meta-llama/Llama-2-7b-chat-hf']) def test_pytorch_chat_with_lora_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -340,6 +342,7 @@ def test_pytorch_chat_with_lora_tp1(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['baichuan-inc/Baichuan2-13B-Chat']) def test_pytorch_chat_with_lora_tp2(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index d7cb3770ed..966f87efb6 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -261,6 +261,7 @@ def test_hf_turbomind_chat_kvint4_tp8(config, model, communicator, cli_case_conf @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', [ 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' @@ -288,6 +289,7 @@ def test_hf_turbomind_chat_fallback_backend_tp1(config, model, communicator, cli @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', [ 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' @@ -315,6 +317,7 @@ def 
test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, communicat @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_fallback_backend_tp2(config, model, communicator, cli_case_config, worker_id): @@ -337,6 +340,7 @@ def test_hf_turbomind_chat_fallback_backend_tp2(config, model, communicator, cli @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_fallback_backend_kvint8_tp2(config, model, communicator, cli_case_config, worker_id): @@ -454,6 +458,7 @@ def test_hf_turbomind_chat_pr_gpu1(config, model, communicator, cli_case_config) @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct']) def test_modelscope_turbomind_chat_tp1(config, model, cli_case_config, worker_id): os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True' diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py index dca119649e..dec6c31798 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py @@ -157,6 +157,7 @@ def test_pipeline_chat_pytorch_pr(config, common_case_config, model, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct']) def 
test_modelscope_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id): @@ -171,6 +172,7 @@ def test_modelscope_pipeline_chat_pytorch_tp1(config, common_case_config, model, @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', ['meta-llama/Llama-2-7b-chat-hf']) def test_pipeline_chat_pytorch_with_lora_tp1(config, common_case_config, model, worker_id): @@ -186,6 +188,7 @@ def test_pipeline_chat_pytorch_with_lora_tp1(config, common_case_config, model, @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', ['baichuan-inc/Baichuan2-13B-Chat']) def test_pipeline_chat_pytorch_with_lora_tp2(config, common_case_config, model, worker_id): diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py index c92d6aa148..647e310863 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py @@ -179,6 +179,7 @@ def test_pipeline_chat_kvint8_tp8(config, common_case_config, model, communicato @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', [ 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' @@ -201,6 +202,7 @@ def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, c @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', [ 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' @@ -226,6 +228,7 @@ def 
test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, m @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_tp2(config, common_case_config, model, communicator, worker_id): @@ -245,6 +248,7 @@ def test_pipeline_chat_fallback_backend_tp2(config, common_case_config, model, c @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp2(config, common_case_config, model, communicator, worker_id): @@ -286,6 +290,7 @@ def test_pipeline_chat_pr(config, common_case_config, model, communicator, worke @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct']) def test_modelscope_pipeline_chat_tp1(config, common_case_config, model, worker_id): diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index f14cdf605d..d095e4fd98 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -150,6 +150,7 @@ def test_pipeline_chat_kvint8_tp4(config, model, communicator, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['OpenGVLab/InternVL2-4B', 'THUDM/glm-4v-9b', 'THUDM/glm-4v-9b-inner-4bits']) @pytest.mark.parametrize('communicator', get_communicator_list()) def 
test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_id): @@ -163,6 +164,7 @@ def test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_ @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['OpenGVLab/InternVL2-4B', 'THUDM/glm-4v-9b', 'THUDM/glm-4v-9b-inner-4bits']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, worker_id): diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py index eaf574c591..f1aff303cd 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py @@ -172,6 +172,7 @@ def test_restful_chat_kvint8_tp8(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [{ 'model': 'Qwen/Qwen2.5-7B-Instruct', 'cuda_prefix': None, @@ -190,6 +191,7 @@ def test_modelscope_restful_chat_tp1(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api_pytorch @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [{ 'model': 'meta-llama/Llama-2-7b-chat-hf', 'cuda_prefix': None, @@ -208,6 +210,7 @@ def test_restful_chat_with_lora_tp1(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api_pytorch @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [{ 'model': 'baichuan-inc/Baichuan2-13B-Chat', @@ -228,6 +231,7 @@ def test_restful_chat_with_lora_tp2(config, common_case_config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 
+@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B', @@ -249,6 +253,7 @@ def test_restful_chat_reasoning_tp1(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', @@ -270,6 +275,7 @@ def test_restful_chat_reasoning_tp2(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'internlm/internlm2_5-7b-chat', @@ -297,6 +303,7 @@ def test_restful_chat_tools_tp1(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'internlm/internlm2_5-20b-chat', @@ -318,6 +325,7 @@ def test_restful_chat_tools_tp2(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_4 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'meta-llama/Meta-Llama-3-1-70B-Instruct', diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py index b692bd17b5..4f61b88438 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py @@ -180,6 +180,7 @@ def test_restful_chat_kvint8_tp8(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'microsoft/Phi-3-mini-4k-instruct', @@ -246,6 +247,7 @@ def test_restful_chat_fallback_backend_tp1(config, common_case_config, worker_id @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api 
@pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'google/gemma-2-27b-it', @@ -357,6 +359,7 @@ def test_restful_logprobs(worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [{ 'model': 'Qwen/Qwen2.5-7B-Instruct', 'cuda_prefix': None, @@ -376,6 +379,7 @@ def test_modelscope_restful_chat_tp1(config, common_case_config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B', @@ -397,6 +401,7 @@ def test_restful_chat_reasoning_tp1(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', @@ -418,6 +423,7 @@ def test_restful_chat_reasoning_tp2(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'internlm/internlm2_5-7b-chat', @@ -445,6 +451,7 @@ def test_restful_chat_tools_tp1(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'internlm/internlm2_5-20b-chat', @@ -466,6 +473,7 @@ def test_restful_chat_tools_tp2(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_4 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'meta-llama/Meta-Llama-3-1-70B-Instruct', From 8dcfc9d1b46904f407b8f60c14196bf190fdbd26 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 16 Sep 2025 15:17:47 +0800 Subject: [PATCH 06/13] update test image for cu12.4 --- 
.github/workflows/daily_ete_test.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 8a90df4309..3e78d3c957 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -96,7 +96,7 @@ jobs: runs-on: [self-hosted, linux-a100] timeout-minutes: 50 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/qa_test_models:/nvme/qa_test_models @@ -136,7 +136,7 @@ jobs: MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -219,7 +219,7 @@ jobs: MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -324,7 +324,7 @@ jobs: model: Intern-S1 timeout-minutes: 60 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -408,7 +408,7 @@ jobs: needs: test_quantization timeout-minutes: 120 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: 
openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -465,7 +465,7 @@ jobs: needs: test_quantization timeout-minutes: 120 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -520,7 +520,7 @@ jobs: matrix: evaluate_type: ['chat', 'base'] container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -594,7 +594,7 @@ jobs: timeout-minutes: 5 runs-on: [self-hosted, linux-a100] container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/qa_test_models:/nvme/qa_test_models @@ -619,7 +619,7 @@ jobs: needs: [test_tools, test_restful, test_pipeline, test_benchmark] timeout-minutes: 5 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip From 3528a7e464dd526a5ded460afde67e218b45e842 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 16 Sep 2025 19:47:26 +0800 Subject: [PATCH 07/13] update --- .github/workflows/daily_ete_test_h800.yml | 73 ++--------------------- autotest/config-h800.yaml | 27 ++++----- 2 files changed, 18 insertions(+), 82 deletions(-) diff --git a/.github/workflows/daily_ete_test_h800.yml 
b/.github/workflows/daily_ete_test_h800.yml index 1c2669d27e..9036a039a6 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -128,73 +128,10 @@ jobs: if: ${{inputs.offline_mode}} run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} - test_quantization: - needs: download_pkgs - if: ${{!cancelled() && contains(needs.download_pkgs.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} - runs-on: [self-hosted, h800-r1] - timeout-minutes: 150 - env: - PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA - MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub - MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules - container: - image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/qa_test_models:/nvme/qa_test_models - - /nvme1/qa_test_models:/nvme1/qa_test_models - - /nvme2/share:/nvme2/share - - /mnt/137_nvme4:/mnt/137_nvme4 - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Copy repository and Artifacts - run: | - cp -r ${{env.TEST_CODE_PATH}}/. . 
- - name: Install lmdeploy - dependency - run: | - python3 -m pip install auto_gptq matplotlib attrdict - python3 -m pip install -r requirements/lite.txt - - name: Install lmdeploy - run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps - python3 -m pip install transformers==4.53.1 datasets==3.6.0 timm - python3 -m pip install -r requirements/test.txt - - name: Check env - run: | - python3 -m pip list - lmdeploy check_env - rm -rf allure-results - # remove tmp log in testcase - rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p - ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - - name: Test lmdeploy - quantization w4a16 - continue-on-error: true - if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') - run: | - pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - quantization w8a8 - continue-on-error: true - if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'pytorch') - run: | - pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Clear workfile - if: always() - run: | - chmod -R 777 $REPORT_DIR - export workdir=$(pwd) - cd .. 
- rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir - test_tools: - if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + if: ${{!cancelled() && !contains(needs.download_pkgs.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} runs-on: [self-hosted, h800-r1] - needs: test_quantization + needs: download_pkgs timeout-minutes: 300 strategy: fail-fast: false @@ -290,14 +227,14 @@ jobs: chmod -R 777 $workdir test_restful: - if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + if: ${{!cancelled() && !contains(needs.download_pkgs.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} runs-on: [self-hosted, h800-r1] - needs: test_quantization + needs: download_pkgs strategy: fail-fast: false matrix: backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} - model: ['Intern-S1', 'internlm2_5-20b-chat', 'internlm2_5-20b'] + model: ['Intern-S1'] include: - tp: 8 model: Intern-S1 diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml index 275a3fdba9..318b6dc465 100644 --- a/autotest/config-h800.yaml +++ b/autotest/config-h800.yaml @@ -68,10 +68,8 @@ pytorch_base_model: turbomind_quatization: no_awq: - - empty - gptq: - - empty - no_kvint4: + - internlm/Intern-S1 + - internlm/Intern-S1-mini - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 - Qwen/Qwen3-4B-FP8 @@ -82,13 +80,11 @@ turbomind_quatization: - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 - no_kvint8: + - openai/gpt-oss-120b + - openai/gpt-oss-20b + gptq: - empty - -pytorch_quatization: - awq: - - internlm/Intern-S1 - - internlm/Intern-S1-mini + no_kvint4: - Qwen/Qwen3-0.6B-FP8 - 
Qwen/Qwen3-1.7B-FP8 - Qwen/Qwen3-4B-FP8 @@ -99,11 +95,14 @@ pytorch_quatization: - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 - - openai/gpt-oss-120b - - openai/gpt-oss-20b + no_kvint8: + - empty + +pytorch_quatization: + awq: + - empty w8a8: - - internlm/Intern-S1 - - internlm/Intern-S1-mini + - empty no_kvint4: - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 From 1b89cb27301fbdcb56787b0f11db07df330a0553 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Wed, 17 Sep 2025 10:22:56 +0800 Subject: [PATCH 08/13] update --- .github/workflows/daily_ete_test_h800.yml | 4 ++-- autotest/config.yaml | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index 9036a039a6..ba2a9c6560 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -37,7 +37,7 @@ on: required: true description: 'regression functions' type: string - default: "['quant', 'tools','restful','pipeline']" + default: "['tools','restful']" schedule: - cron: '00 14 * * 0-4' @@ -115,7 +115,7 @@ jobs: run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-h800.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml - name: Copy repository - offline if: ${{inputs.offline_mode}} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-3090.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-h800.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml - name: Download Artifacts if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} uses: actions/download-artifact@v4 diff --git a/autotest/config.yaml b/autotest/config.yaml index fab9a5af89..5844758229 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -33,8 +33,8 @@ tp_config: MiniCPM-V-2_6: 2 gemma-2-27b-it: 2 InternVL2-Llama3-76B-AWQ: 4 - gpt-oss-20b: 2 - gpt-oss-120b: 4 + gpt-oss-20b-bf16: 2 + gpt-oss-120b-bf16: 4 turbomind_chat_model: @@ -139,8 +139,8 @@ pytorch_chat_model: - Qwen/Qwen2.5-VL-32B-Instruct - Qwen/Qwen2-VL-2B-Instruct - Qwen/Qwen2-VL-7B-Instruct - - openai/gpt-oss-20b - - openai/gpt-oss-120b + - lmsys/gpt-oss-20b-bf16 + - lmsys/gpt-oss-120b-bf16 - mistralai/Mistral-7B-Instruct-v0.3 - mistralai/Mixtral-8x7B-Instruct-v0.1 - google/gemma-3-12b-it @@ -368,5 +368,5 @@ benchmark_model: - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-72B-Instruct - deepseek-ai/DeepSeek-V2-Lite-Chat - - openai/gpt-oss-20b - - openai/gpt-oss-120b + - lmsys/gpt-oss-20b-bf16 + - lmsys/gpt-oss-120b-bf16 From 06d0532625e61a0fab03659bf820c599c13479c3 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Wed, 17 Sep 2025 10:30:09 +0800 Subject: [PATCH 09/13] update --- .github/workflows/daily_ete_test.yml | 12 ++++++------ .github/workflows/daily_ete_test_3090.yml | 6 +++--- .github/workflows/daily_ete_test_5080.yml | 6 +++--- .github/workflows/daily_ete_test_h800.yml | 4 ++-- .../restful/test_restful_chat_hf_turbomind_mllm.py | 1 + 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 3e78d3c957..8b8969ada4 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -168,7 +168,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir 
${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true @@ -251,7 +251,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true @@ -352,7 +352,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Start restful api if: matrix.model != 'internlm2_5-20b' @@ -436,7 +436,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - interface pipeline case run: | @@ -493,7 +493,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test benchmark script run: | @@ -560,7 +560,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Setup paths for evaluation run: | diff --git 
a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 6c0a0942c5..9243887ecf 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -163,7 +163,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true @@ -235,7 +235,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true @@ -299,7 +299,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Start restful api turbomind if: matrix.backend == 'turbomind' diff --git a/.github/workflows/daily_ete_test_5080.yml b/.github/workflows/daily_ete_test_5080.yml index bd12b0beff..3a080f2615 100644 --- a/.github/workflows/daily_ete_test_5080.yml +++ b/.github/workflows/daily_ete_test_5080.yml @@ -163,7 +163,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true @@ -234,7 +234,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf 
/nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true @@ -298,7 +298,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Start restful api turbomind if: matrix.backend == 'turbomind' diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index ba2a9c6560..d6ad6547fd 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -178,7 +178,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true @@ -267,7 +267,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Start restful api if: matrix.model != 'internlm2_5-20b' diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py index 2cfbc00020..c238ccd962 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py @@ -146,6 +146,7 @@ def test_restful_chat_kvint8_tp4(config, worker_id): @pytest.mark.order(7) 
@pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'OpenGVLab/InternVL2-4B', From 2e128dc6dd09ddf6e6c486b1c76c5b41a29db6cb Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Wed, 17 Sep 2025 11:21:55 +0800 Subject: [PATCH 10/13] update --- .github/workflows/benchmark.yml | 2 +- .github/workflows/daily_ete_test.yml | 5 ++--- .github/workflows/daily_ete_test_h800.yml | 2 +- .github/workflows/evaluate.yml | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 62d2e19ca5..00f7e64b61 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -108,7 +108,7 @@ jobs: - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 8b8969ada4..1c2f0b549d 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -550,8 +550,7 @@ jobs: run: | git clone --depth=1 https://github.com/open-compass/opencompass.git cd opencompass - cp /nvme/qa_test_models/offline_pkg/requirements-oc.txt requirements/runtime.txt - python3 -m pip install -e . + python3 -m pip install . 
echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV - name: Check env run: | @@ -571,7 +570,7 @@ jobs: run: | export LMDEPLOY_DIR=$(pwd) - python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true - name: Evaluate base models if: matrix.evaluate_type == 'base' run: | diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index d6ad6547fd..43b9dc4417 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ 
b/.github/workflows/daily_ete_test_h800.yml @@ -169,7 +169,7 @@ jobs: python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 47e1929421..be64e8743f 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -136,7 +136,7 @@ jobs: - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} From a30c72d79d12e58d59a35e7cd6c3530142826ee9 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Wed, 17 Sep 2025 16:09:31 +0800 Subject: [PATCH 11/13] update --- .github/workflows/daily_ete_test_h800.yml | 2 +- autotest/config-h800.yaml | 2 +- autotest/tools/pipeline/llm_case.py | 2 +- autotest/tools/pipeline/mllm_case.py | 2 +- autotest/utils/run_restful_chat.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index 43b9dc4417..db6131845e 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -289,7 +289,7 @@ jobs: if: always() run: | chmod -R 777 $REPORT_DIR - chmod -R 777 /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + chmod -R 777 ${{env.REPORT_DIR}} export workdir=$(pwd) cd .. 
rm -rf $workdir diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml index 318b6dc465..1a3600a72d 100644 --- a/autotest/config-h800.yaml +++ b/autotest/config-h800.yaml @@ -9,7 +9,7 @@ env_tag: h800 tp_config: Intern-S1: 8 Qwen3-235B-A22B: 8 - Qwen3-235B-A22B-FP8: 4 + Qwen3-235B-A22B-FP8: 8 Qwen3-30B-A3B: 2 Qwen3-32B: 2 gpt-oss-120b: 2 diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 2de77d2bd3..17c3f58376 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -7,7 +7,7 @@ from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline from lmdeploy.utils import is_bf16_supported -gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) +gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=10) def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = None): diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index 9689581ef9..050ae1e1b7 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -10,7 +10,7 @@ from lmdeploy.vl.constants import IMAGE_TOKEN from lmdeploy.vl.utils import encode_image_base64 -gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) +gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=10) PIC1 = 'tiger.jpeg' PIC2 = 'human-pose.jpg' diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index bd7c7244a2..672a235ead 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -144,7 +144,7 @@ def run_all_step(config, cases_info, worker_id: str = '', port: int = DEFAULT_PO case_info = cases_info.get(case) - with allure.step(case + ' step2 - restful_test - openai chat'): + with allure.step(case + ' restful_test - openai chat'): restful_result, restful_log, msg = open_chat_test(config, case, case_info, 
model, http_url, worker_id) allure.attach.file(restful_log, attachment_type=allure.attachment_type.TEXT) with assume: From b375a0a60c3ffc536756064c94a27c2b036e1ad9 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Thu, 18 Sep 2025 18:08:28 +0800 Subject: [PATCH 12/13] remove communicator native when tp=1 --- .github/workflows/daily_ete_test_h800.yml | 4 ++ autotest/config-h800.yaml | 7 ++- .../chat/test_command_chat_hf_turbomind.py | 37 ++++++---------- .../test_pipeline_chat_turbomind_llm.py | 39 +++++------------ .../test_pipeline_chat_turbomind_mllm.py | 43 ++++++------------- .../test_restful_chat_hf_turbomind_llm.py | 24 ++--------- .../test_restful_chat_hf_turbomind_mllm.py | 12 +++--- autotest/utils/config_utils.py | 4 +- autotest/utils/pipeline_chat.py | 5 ++- 9 files changed, 59 insertions(+), 116 deletions(-) diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index db6131845e..1dab90bebf 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -158,6 +158,8 @@ jobs: - /nvme/qa_test_models:/nvme/qa_test_models - /nvme1/qa_test_models:/nvme1/qa_test_models - /nvme2/share:/nvme2/share + - /mnt/137_nvme2:/mnt/137_nvme2 + - /mnt/137_nvme3:/mnt/137_nvme3 - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: @@ -247,6 +249,8 @@ jobs: - /nvme/qa_test_models:/nvme/qa_test_models - /nvme1/qa_test_models:/nvme1/qa_test_models - /nvme2/share:/nvme2/share + - /mnt/137_nvme2:/mnt/137_nvme2 + - /mnt/137_nvme3:/mnt/137_nvme3 - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml index 1a3600a72d..068d074452 100644 --- a/autotest/config-h800.yaml +++ b/autotest/config-h800.yaml @@ -27,6 +27,7 @@ turbomind_chat_model: - Qwen/Qwen3-235B-A22B - Qwen/Qwen3-235B-A22B-FP8 - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-32B - 
Qwen/Qwen3-32B-FP8 - openai/gpt-oss-120b @@ -43,6 +44,7 @@ pytorch_chat_model: - Qwen/Qwen3-235B-A22B - Qwen/Qwen3-235B-A22B-FP8 - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 - unsloth/gpt-oss-120b-BF16 @@ -64,7 +66,7 @@ turbomind_base_model: pytorch_base_model: - internlm/Intern-S1-mini - Qwen/Qwen3-4B-FP8 - - openai/gpt-oss-20b + - unsloth/gpt-oss-20b-BF16 turbomind_quatization: no_awq: @@ -78,6 +80,7 @@ turbomind_quatization: - Qwen/Qwen3-235B-A22B - Qwen/Qwen3-235B-A22B-FP8 - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 - openai/gpt-oss-120b @@ -93,6 +96,7 @@ turbomind_quatization: - Qwen/Qwen3-235B-A22B - Qwen/Qwen3-235B-A22B-FP8 - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 no_kvint8: @@ -112,6 +116,7 @@ pytorch_quatization: - Qwen/Qwen3-235B-A22B - Qwen/Qwen3-235B-A22B-FP8 - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 no_kvint8: diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index 966f87efb6..7e8250daa4 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -12,8 +12,7 @@ @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_chat_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' if 'coder' in model: usercase = 'code_testcase' @@ -22,8 +21,7 @@ def test_hf_turbomind_chat_tp1(config, model, communicator, cli_case_config, wor cli_case_config.get(usercase), model, 'turbomind', - cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator}') + 
cuda_prefix=get_cuda_prefix_by_workerid(worker_id)) if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -103,8 +101,7 @@ def test_hf_turbomind_chat_tp8(config, model, communicator, cli_case_config, wor @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_kvint4_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_chat_kvint4_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' if 'coder' in model: usercase = 'code_testcase' @@ -114,7 +111,7 @@ def test_hf_turbomind_chat_kvint4_tp1(config, model, communicator, cli_case_conf model, 'turbomind', cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator} --quant-policy 4') + extra='--quant-policy 4') if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -172,8 +169,7 @@ def test_hf_turbomind_chat_kvint4_tp4(config, model, communicator, cli_case_conf @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_kvint8_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_chat_kvint8_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' if 'coder' in model: usercase = 'code_testcase' @@ -183,7 +179,7 @@ def test_hf_turbomind_chat_kvint8_tp1(config, model, communicator, cli_case_conf model, 'turbomind', cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator} --quant-policy 8') + extra='--quant-policy 8') if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -266,8 +262,7 @@ 
def test_hf_turbomind_chat_kvint4_tp8(config, model, communicator, cli_case_conf 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' ]) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_fallback_backend_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_chat_fallback_backend_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' if 'coder' in model: usercase = 'code_testcase' @@ -276,8 +271,7 @@ def test_hf_turbomind_chat_fallback_backend_tp1(config, model, communicator, cli cli_case_config.get(usercase), model, 'turbomind', - cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator}') + cuda_prefix=get_cuda_prefix_by_workerid(worker_id)) if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -294,8 +288,7 @@ def test_hf_turbomind_chat_fallback_backend_tp1(config, model, communicator, cli 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' ]) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' if 'coder' in model: usercase = 'code_testcase' @@ -305,7 +298,7 @@ def test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, communicat model, 'turbomind', cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator} --quant-policy 8') + extra='--quant-policy 8') if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -365,7 +358,6 @@ def test_hf_turbomind_chat_fallback_backend_kvint8_tp2(config, model, 
communicat @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='base_model')) -@pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_base_tp1(config, model, communicator, cli_case_config, worker_id): usercase = 'base_testcase' result, chat_log, msg = hf_command_line_test(config, @@ -373,8 +365,7 @@ def test_hf_turbomind_base_tp1(config, model, communicator, cli_case_config, wor cli_case_config.get(usercase), model, 'turbomind', - cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator}') + cuda_prefix=get_cuda_prefix_by_workerid(worker_id)) if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -436,8 +427,7 @@ def test_hf_turbomind_chat_pr(config, model, communicator, cli_case_config): @pytest.mark.gpu_num_1 @pytest.mark.pr_test @pytest.mark.parametrize('model', ['OpenGVLab/InternVL3-8B']) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_pr_gpu1(config, model, communicator, cli_case_config): +def test_hf_turbomind_chat_pr_gpu1(config, model, cli_case_config): usercase = 'chat_testcase' result, chat_log, msg = hf_command_line_test(config, @@ -445,8 +435,7 @@ def test_hf_turbomind_chat_pr_gpu1(config, model, communicator, cli_case_config) cli_case_config.get(usercase), model, 'turbomind', - cuda_prefix='CUDA_VISIBLE_DEVICES=5,6', - extra=f'--communicator {communicator}') + cuda_prefix='CUDA_VISIBLE_DEVICES=5,6') if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py index 647e310863..fcac93e935 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py @@ -12,11 +12,10 @@ 
@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_tp1(config, common_case_config, model, communicator, worker_id): +def test_pipeline_chat_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {'communicator': communicator}) + run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {}) @pytest.mark.order(6) @@ -65,14 +64,10 @@ def test_pipeline_chat_tp8(config, common_case_config, model, communicator, work @pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, communicator, worker_id): +def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { - 'quant_policy': 4, - 'communicator': communicator - }) + run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, {'quant_policy': 4}) @pytest.mark.order(6) @@ -116,14 +111,10 @@ def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, communicato @pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, communicator, worker_id): +def test_pipeline_chat_kvint8_tp1(config, common_case_config, 
model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { - 'quant_policy': 8, - 'communicator': communicator - }) + run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, {'quant_policy': 8}) @pytest.mark.order(6) @@ -184,17 +175,11 @@ def test_pipeline_chat_kvint8_tp8(config, common_case_config, model, communicato 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' ]) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, communicator, worker_id): +def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) - run_pipeline_chat_test(config, - common_case_config, - model, - 'turbomind', - worker_id, {'communicator': communicator}, - is_smoke=True) + run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {}, is_smoke=True) @pytest.mark.order(6) @@ -207,8 +192,7 @@ def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, c 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' ]) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, model, communicator, worker_id): +def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=1) os.environ['MASTER_PORT'] = 
str(int(worker_id.replace('gw', '')) + 29500) @@ -216,10 +200,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, m common_case_config, model, 'turbomind-kvint', - worker_id, { - 'quant_policy': 8, - 'communicator': communicator - }, + worker_id, {'quant_policy': 8}, is_smoke=True) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index d095e4fd98..bce455d508 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -14,11 +14,10 @@ @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='vl_model')) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_tp1(config, model, communicator, worker_id): +def test_pipeline_chat_tp1(config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}) + run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {}) @pytest.mark.order(6) @@ -56,14 +55,10 @@ def test_pipeline_chat_tp4(config, model, communicator, worker_id): @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4, model_type='vl_model')) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_kvint4_tp1(config, model, communicator, worker_id): +def test_pipeline_chat_kvint4_tp1(config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, { - 'quant_policy': 4, - 'communicator': communicator - }) + run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, 
{'quant_policy': 4}) @pytest.mark.order(6) @@ -104,14 +99,10 @@ def test_pipeline_chat_kvint4_tp4(config, model, communicator, worker_id): @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8, model_type='vl_model')) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_kvint8_tp1(config, model, communicator, worker_id): +def test_pipeline_chat_kvint8_tp1(config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, { - 'quant_policy': 8, - 'communicator': communicator - }) + run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8}) @pytest.mark.order(6) @@ -152,12 +143,11 @@ def test_pipeline_chat_kvint8_tp4(config, model, communicator, worker_id): @pytest.mark.gpu_num_2 @pytest.mark.other @pytest.mark.parametrize('model', ['OpenGVLab/InternVL2-4B', 'THUDM/glm-4v-9b', 'THUDM/glm-4v-9b-inner-4bits']) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_id): +def test_pipeline_chat_fallback_backend_tp1(config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) - run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}, is_smoke=True) + run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {}, is_smoke=True) @pytest.mark.order(6) @@ -166,19 +156,11 @@ def test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_ @pytest.mark.gpu_num_2 @pytest.mark.other @pytest.mark.parametrize('model', ['OpenGVLab/InternVL2-4B', 'THUDM/glm-4v-9b', 'THUDM/glm-4v-9b-inner-4bits']) -@pytest.mark.parametrize('communicator', 
get_communicator_list()) -def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, worker_id): +def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) - run_pipeline_vl_chat_test(config, - model, - BACKEND_KVINT, - worker_id, { - 'quant_policy': 8, - 'communicator': communicator - }, - is_smoke=True) + run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8}, is_smoke=True) @pytest.mark.pipeline_chat @@ -188,8 +170,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, @pytest.mark.parametrize( 'model', ['liuhaotian/llava-v1.6-vicuna-7b', 'OpenGVLab/InternVL2-4B', 'OpenGVLab/InternVL2-8B', 'OpenGVLab/InternVL3-8B']) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_pr_test(config, model, communicator, worker_id): +def test_pipeline_pr_test(config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = str(int(get_cuda_id_by_workerid(worker_id)) + 5) - run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}, is_smoke=True) + run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {}, is_smoke=True) diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py index 4f61b88438..519ed6718f 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py @@ -201,37 +201,19 @@ def test_restful_chat_kvint8_tp8(config, common_case_config, worker_id): 'model': 'microsoft/Phi-3-mini-4k-instruct', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --communicator native' + 'extra': ' --quant-policy 8' }, { 'model': 
'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --communicator native' + 'extra': ' --quant-policy 8' }, { 'model': 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --communicator native' - }, - { - 'model': 'microsoft/Phi-3-mini-4k-instruct', - 'cuda_prefix': None, - 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' - }, - { - 'model': 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', - 'cuda_prefix': None, - 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' - }, - { - 'model': 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8', - 'cuda_prefix': None, - 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8' }, ], indirect=True) diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py index c238ccd962..a98fbbfd6c 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py @@ -19,7 +19,7 @@ def prepare_environment(request, config, worker_id): def getModelList(tp_num): model_list = [] - for communicator in get_communicator_list(): + for communicator in get_communicator_list(tp_num): model_list += [{ 'model': item, 'cuda_prefix': None, @@ -65,7 +65,7 @@ def test_restful_chat_tp4(config, worker_id): def getKvintModelList(tp_num, quant_policy: int = None): model_list = [] - for communicator in get_communicator_list(): + for communicator in get_communicator_list(tp_num): model_list += [{ 'model': item, 'cuda_prefix': None, @@ -172,25 +172,25 @@ def test_restful_chat_kvint8_tp4(config, worker_id): 'model': 'OpenGVLab/InternVL2-4B', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8' }, { 'model': 'Qwen/Qwen2.5-VL-7B-Instruct', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --quant-policy 8 
--communicator native' + 'extra': ' --quant-policy 8' }, { 'model': 'THUDM/glm-4v-9b', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8' }, { 'model': 'THUDM/glm-4v-9b-inner-4bits', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8' }, ], indirect=True) diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index c53e33bf0f..e9e49a3666 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -83,8 +83,8 @@ def get_all_model_list(tp_num: int = None, quant_policy: int = None, model_type: return case_list -def get_communicator_list(): - if is_bf16_supported(): +def get_communicator_list(tp_num: int = None): + if tp_num != 1 and is_bf16_supported(): return ['native', 'nccl'] return ['nccl'] diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 01c686f7dd..8fc48d92b5 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -279,13 +279,14 @@ def internvl_vl_testcase(output_text, f, lang: str = 'en'): assert case_result, f'reason: separate images: panda should in {response}' with allure.step(f'internvl-separate-images2-{lang}'): response = get_response_from_output(output_text, f'internvl-separate-images2-{lang}') - case_result = any(word in response.lower() for word in ['panda', '熊猫', 'same', 'different', 'difference']) + case_result = any(word in response.lower() + for word in ['panda', '熊猫', 'same', 'different', 'difference', 'identical']) f.writelines(f'internvl-separate-images2-{lang} result: {case_result}, reason: panda should in {response} \n') with assume: assert case_result, f'reason: separate images2: panda should in {response}' with allure.step(f'internvl-video-{lang}'): response = get_response_from_output(output_text, f'internvl-video-{lang}') - case_result = any(word in response.lower() for word in ['red panda', 'eat', 
'熊猫', '竹子', 'food']) + case_result = any(word in response.lower() for word in ['red panda', 'eat', '熊猫', '竹子', 'food', 'hold']) f.writelines(f'internvl-video-{lang} result: {case_result}, reason: panda should in {response} \n') with assume: assert case_result, f'reason: video: panda should in {response}' From 75d2089ddfbbca47498c9d0664c717f31b820ef3 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Thu, 18 Sep 2025 18:26:13 +0800 Subject: [PATCH 13/13] fix lint --- .../chat/test_command_chat_hf_turbomind.py | 4 ++-- .../test_pipeline_chat_turbomind_mllm.py | 18 +++++++++--------- .../test_restful_chat_hf_turbomind_llm.py | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index 991db6b383..82c747358b 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -358,7 +358,7 @@ def test_hf_turbomind_chat_fallback_backend_kvint8_tp2(config, model, communicat @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='base_model')) -def test_hf_turbomind_base_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_base_tp1(config, model, cli_case_config, worker_id): usercase = 'base_testcase' result, chat_log, msg = hf_command_line_test(config, usercase, @@ -438,7 +438,7 @@ def test_hf_turbomind_chat_pr_gpu1(config, model, cli_case_config): cli_case_config.get(usercase), model, 'turbomind', - cuda_prefix='CUDA_VISIBLE_DEVICES=5,6') + cuda_prefix=env_var + '5,6') if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index 7415243ec4..bcfd071eba 100644 --- 
a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -161,14 +161,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, worker_id): if 'gw' in worker_id: set_device_env_variable(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) - run_pipeline_vl_chat_test(config, - model, - BACKEND_KVINT, - worker_id, { - 'quant_policy': 8, - 'communicator': communicator - }, - is_smoke=True) + run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8}, is_smoke=True) @pytest.mark.order(6) @@ -181,7 +174,14 @@ def test_pipeline_chat_fallback_backend_kvint8_tp2(config, model, communicator, if 'gw' in worker_id: set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) - run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8}, is_smoke=True) + run_pipeline_vl_chat_test(config, + model, + BACKEND_KVINT, + worker_id, { + 'quant_policy': 8, + 'communicator': communicator + }, + is_smoke=True) @pytest.mark.pipeline_chat diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py index 519ed6718f..daf2664662 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py @@ -80,7 +80,7 @@ def test_restful_chat_tp8(config, common_case_config, worker_id): def getKvintModelList(tp_num, quant_policy): model_list = [] - for communicator in get_communicator_list(): + for communicator in get_communicator_list(tp_num): model_list += [{ 'model': item, 'cuda_prefix': None,