diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 0ee4db9a28..00f7e64b61 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -27,7 +27,7 @@ on:
 env:
   HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
   HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
-  OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }}
+  OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }}
   REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }}
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
   FAIL_CONFIG: ${{ github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}}
@@ -42,7 +42,7 @@ jobs:
     env:
       PYTHON_VERSION: ${{ matrix.pyver }}
       PLAT_NAME: manylinux2014_x86_64
-      DOCKER_TAG: cuda11.8
+      DOCKER_TAG: cuda12.4
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
@@ -108,7 +108,7 @@ jobs:
       - name: Install lmdeploy
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Install lmdeploy - offline
         if: ${{inputs.offline_mode}}
diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml
index f10bc2993b..1c2f0b549d 100644
--- a/.github/workflows/daily_ete_test.yml
+++ b/.github/workflows/daily_ete_test.yml
@@ -44,7 +44,7 @@ on:
 env:
   HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
   HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
-  OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }}
+  OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }}
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
   REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }}
   COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
@@ -64,7 +64,7 @@ jobs:
     env:
       PYTHON_VERSION: ${{ matrix.pyver }}
       PLAT_NAME: manylinux2014_x86_64
-      DOCKER_TAG: cuda11.8
+      DOCKER_TAG: cuda12.4
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
@@ -96,7 +96,7 @@ jobs:
     runs-on: [self-hosted, linux-a100]
     timeout-minutes: 50
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/qa_test_models:/nvme/qa_test_models
@@ -136,7 +136,7 @@ jobs:
       MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub
       MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -168,7 +168,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Test lmdeploy - quantization w4a16
         continue-on-error: true
@@ -219,7 +219,7 @@ jobs:
       MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub
       MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -251,7 +251,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Test lmdeploy - chat
         continue-on-error: true
@@ -324,7 +324,7 @@ jobs:
             model: Intern-S1
     timeout-minutes: 60
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -352,7 +352,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Start restful api
         if:  matrix.model != 'internlm2_5-20b'
@@ -408,7 +408,7 @@ jobs:
     needs: test_quantization
     timeout-minutes: 120
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -436,7 +436,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Test lmdeploy - interface pipeline case
         run: |
@@ -465,7 +465,7 @@ jobs:
     needs: test_quantization
     timeout-minutes: 120
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -493,7 +493,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Test benchmark script
         run: |
@@ -520,7 +520,7 @@ jobs:
       matrix:
         evaluate_type: ['chat', 'base']
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -550,8 +550,7 @@ jobs:
         run: |
           git clone --depth=1 https://github.com/open-compass/opencompass.git
           cd opencompass
-          cp /nvme/qa_test_models/offline_pkg/requirements-oc.txt requirements/runtime.txt
-          python3 -m pip install -e .
+          python3 -m pip install .
           echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV
       - name: Check env
         run: |
@@ -560,7 +559,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Setup paths for evaluation
         run: |
@@ -571,7 +570,7 @@ jobs:
         run: |
           export LMDEPLOY_DIR=$(pwd)
 
-          python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true
+          python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true
       - name: Evaluate base models
         if: matrix.evaluate_type == 'base'
         run: |
@@ -594,7 +593,7 @@ jobs:
     timeout-minutes: 5
     runs-on: [self-hosted, linux-a100]
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/qa_test_models:/nvme/qa_test_models
@@ -619,7 +618,7 @@ jobs:
     needs: [test_tools, test_restful, test_pipeline, test_benchmark]
     timeout-minutes: 5
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml
index 21d200a405..9243887ecf 100644
--- a/.github/workflows/daily_ete_test_3090.yml
+++ b/.github/workflows/daily_ete_test_3090.yml
@@ -153,7 +153,7 @@ jobs:
           python3 -m pip install -r requirements/lite.txt
       - name: Install lmdeploy
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install transformers==4.53.1 datasets==3.6.0 timm
           python3 -m pip install -r requirements/test.txt
       - name: Check env
@@ -163,7 +163,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Test lmdeploy - quantization w4a16
         continue-on-error: true
@@ -226,7 +226,7 @@ jobs:
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Check env
         run: |
@@ -235,7 +235,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Test lmdeploy - chat
         continue-on-error: true
@@ -290,7 +290,7 @@ jobs:
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Check env
         run: |
@@ -299,7 +299,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Start restful api turbomind
         if: matrix.backend == 'turbomind'
@@ -370,7 +370,7 @@ jobs:
         run: cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Get coverage report
         run: |
diff --git a/.github/workflows/daily_ete_test_5080.yml b/.github/workflows/daily_ete_test_5080.yml
index 7d9b250385..3a080f2615 100644
--- a/.github/workflows/daily_ete_test_5080.yml
+++ b/.github/workflows/daily_ete_test_5080.yml
@@ -92,7 +92,7 @@ jobs:
   download_pkgs:
     needs: linux-build
     if: ${{!cancelled()}}
-    runs-on: [self-hosted, 5090-r1]
+    runs-on: [self-hosted, 5080-r1]
     timeout-minutes: 50
     container:
       image: openmmlab/lmdeploy:latest-cu12.8
@@ -129,7 +129,7 @@ jobs:
   test_quantization:
     needs: download_pkgs
     if: ${{!cancelled() && contains(needs.download_pkgs.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}}
-    runs-on: [self-hosted, 5090-r1]
+    runs-on: [self-hosted, 5080-r1]
     timeout-minutes: 150
     env:
       PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA
@@ -153,7 +153,7 @@ jobs:
           python3 -m pip install -r requirements/lite.txt
       - name: Install lmdeploy
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install transformers==4.53.1 datasets==3.6.0 timm
           python3 -m pip install -r requirements/test.txt
       - name: Check env
@@ -163,7 +163,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Test lmdeploy - quantization w4a16
         continue-on-error: true
@@ -188,7 +188,7 @@ jobs:
           chmod -R 777 $workdir
   test_tools:
     if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}}
-    runs-on: [self-hosted, 5090-r1]
+    runs-on: [self-hosted, 5080-r1]
     needs: test_quantization
     timeout-minutes: 300
     strategy:
@@ -225,7 +225,7 @@ jobs:
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Check env
         run: |
@@ -234,7 +234,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Test lmdeploy - chat
         continue-on-error: true
@@ -265,7 +265,7 @@ jobs:
           chmod -R 777 $workdir
   test_restful:
     if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}}
-    runs-on: [self-hosted, 5090-r1]
+    runs-on: [self-hosted, 5080-r1]
     needs: test_quantization
     strategy:
       fail-fast: false
@@ -289,7 +289,7 @@ jobs:
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Check env
         run: |
@@ -298,7 +298,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Start restful api turbomind
         if: matrix.backend == 'turbomind'
@@ -353,7 +353,7 @@ jobs:
           chmod -R 777 $workdir
   get_coverage_report:
     if: ${{!cancelled() && success()}}
-    runs-on: [self-hosted, 5090-r1]
+    runs-on: [self-hosted, 5080-r1]
     needs: [test_tools, test_restful]
     timeout-minutes: 5
     container:
@@ -368,7 +368,7 @@ jobs:
         run: cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Get coverage report
         run: |
diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml
new file mode 100644
index 0000000000..1dab90bebf
--- /dev/null
+++ b/.github/workflows/daily_ete_test_h800.yml
@@ -0,0 +1,338 @@
+name: daily_ete_test_h800
+
+on:
+  workflow_dispatch:
+    inputs:
+      repo_org:  # passed to actions/checkout as `repository`, so it must be in org/repo form
+        required: false
+        description: 'Tested repository in org/repo form. Default is "InternLM/lmdeploy"'
+        type: string
+        default: 'InternLM/lmdeploy'
+      repo_ref:
+        required: false
+        description: 'Set branch or tag or commit id. Default is "main"'
+        type: string
+        default: 'main'
+      backend:  # JSON list; expanded with fromJSON into the test job matrices
+        required: true
+        description: 'Set backend testcase filter as a JSON list: turbomind, pytorch, or both. Default is ["turbomind", "pytorch"]'
+        type: string
+        default: '["turbomind", "pytorch"]'
+      model:  # JSON list; expanded with fromJSON into the test_tools matrix
+        required: true
+        description: 'Set testcase module filter as a JSON list: llm, mllm. Default contains all models'
+        type: string
+        default: '["llm", "mllm"]'
+      function:  # JSON list; expanded with fromJSON into the test_tools matrix
+        required: true
+        description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions'
+        type: string
+        default: '["pipeline", "restful", "chat"]'
+      offline_mode:  # true skips the wheel build and takes code and wheel from OFFLINE_CODE_PATH
+        required: true
+        description: 'Whether to run in offline mode. If true, you must prepare the code and whl package yourself'
+        type: boolean
+        default: false
+      regression_func:  # JSON list; each test job checks it with contains(fromJSON(...), '<job>')
+        required: true
+        description: 'Set regression job filter as a JSON list: tools, restful. Default runs both'
+        type: string
+        default: '["tools", "restful"]'
+  schedule:
+    - cron:  '00 14 * * 0-4'  # 14:00 Sunday through Thursday; GitHub evaluates schedules in UTC
+
+env:
+  HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache  # same host path the test jobs mount at /root/.cache/pip
+  HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai  # same zoneinfo file the containers mount as /etc/localtime
+  OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }}  # per-run wheel output dir under builder/manywheel
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
+  REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }}  # per-run allure results, .coverage files, server logs and .pytest_cache
+  COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy  # appended to pytest calls; NOTE(review): assumes the image installs lmdeploy at this path - confirm
+  FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}}  # '--lf --lfnf none' on re-run attempts of scheduled runs, '--lf' otherwise
+  TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ github.run_id }}  # per-run staging dir: download_pkgs fills it, test jobs copy from it
+  OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy  # source of code and wheel when offline_mode is true
+  OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt  # pip-installed in each test job before the wheel
+
+jobs:
+  linux-build:  # builds the lmdeploy wheel and uploads it as a run-scoped artifact
+    if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}}  # skipped for manual offline-mode runs
+    strategy:
+      matrix:
+        pyver: [py310]  # download_pkgs hard-codes "py310" in the artifact name; keep both in sync
+    runs-on: ubuntu-latest
+    env:
+      PYTHON_VERSION: ${{ matrix.pyver }}
+      PLAT_NAME: manylinux2014_x86_64
+      DOCKER_TAG: cuda12.8
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}  # inputs are empty on scheduled runs, hence the fallback
+          ref: ${{github.event.inputs.repo_ref || 'main'}}  # same fallback for the ref
+      - name: Build  # echoes the build parameters, strips -it from the script's docker run, then runs the manywheel build
+        run: |
+          echo ${PYTHON_VERSION}
+          echo ${PLAT_NAME}
+          echo ${DOCKER_TAG}
+          echo ${OUTPUT_FOLDER}
+          echo ${GITHUB_RUN_ID}
+          # remove -it
+          sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
+          bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          if-no-files-found: error  # a build that produced no files fails here rather than in download_pkgs
+          path: builder/manywheel/${{ env.OUTPUT_FOLDER }}
+          retention-days: 1
+          name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}  # downloaded by name in download_pkgs
+
+
+  download_pkgs:  # stages the test code and the wheel under TEST_CODE_PATH for the test jobs
+    needs: linux-build
+    if: ${{!cancelled()}}  # must still run when linux-build is skipped (offline mode)
+    runs-on: [self-hosted, h800-r1]
+    timeout-minutes: 50
+    container:
+      image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8
+      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
+      volumes:
+        - /nvme/qa_test_models:/nvme/qa_test_models
+        - /nvme1/qa_test_models:/nvme1/qa_test_models
+        - /nvme2/share:/nvme2/share
+        - /mnt/137_nvme4:/mnt/137_nvme4
+        - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
+    steps:
+      - name: Clone repository
+        uses: actions/checkout@v2
+        if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: Copy repository  # recreates the staging dir (parents included) and makes config-h800.yaml the active autotest/config.yaml
+        if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
+        run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir -p ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-h800.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml
+      - name: Copy repository - offline  # same staging, but sourced from OFFLINE_CODE_PATH instead of a fresh checkout
+        if: ${{inputs.offline_mode}}
+        run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir -p ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-h800.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml
+      - name: Download Artifacts
+        if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
+        uses: actions/download-artifact@v4
+        with:
+          name: my-artifact-${{ github.run_id }}-py310  # must match the artifact name uploaded by linux-build
+      - name: Copy Artifacts  # drops any wheel already in the staging dir before copying in the built one
+        if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
+        run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}}
+      - name: Copy Artifacts - offline
+        if: ${{inputs.offline_mode}}
+        run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}}
+
+  test_tools:  # runs the chat / pipeline / restful tool tests for every backend x model x function combination
+    if: ${{!cancelled() && !contains(needs.download_pkgs.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}}  # always on schedule; on manual runs only when regression_func lists 'tools'
+    runs-on: [self-hosted, h800-r1]
+    needs: download_pkgs
+    timeout-minutes: 300
+    strategy:
+      fail-fast: false
+      matrix:
+        backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}}  # inputs are empty on scheduled runs, hence the fallbacks
+        model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}}
+        function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}}
+        exclude:  # the chat step only runs for model == 'llm', so mllm + chat combinations are dropped
+          - backend: turbomind
+            model: mllm
+            function: chat
+          - backend: pytorch
+            model: mllm
+            function: chat
+    env:
+      PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA
+      MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub
+      MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules
+    container:
+      image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8
+      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
+      volumes:
+        - /nvme/github-actions/pip-cache:/root/.cache/pip
+        - /nvme/qa_test_models:/nvme/qa_test_models
+        - /nvme1/qa_test_models:/nvme1/qa_test_models
+        - /nvme2/share:/nvme2/share
+        - /mnt/137_nvme2:/mnt/137_nvme2
+        - /mnt/137_nvme3:/mnt/137_nvme3
+        - /mnt/137_nvme4:/mnt/137_nvme4
+        - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
+    steps:
+      - name: Copy repository and Artifacts  # pulls the code and wheel staged by download_pkgs into the workspace
+        run: |
+          cp -r ${{env.TEST_CODE_PATH}}/. .
+      - name: Install lmdeploy - dependency
+        run: |
+          python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
+      - name: Install lmdeploy  # removes any installed lmdeploy, then installs the staged wheel without dependencies (installed in the previous step)
+        run: |
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip install -r requirements/test.txt
+      - name: Check env  # prints the environment, clears stale results and logs, and links autotest/.pytest_cache to the copy in REPORT_DIR
+        run: |
+          python3 -m pip list
+          lmdeploy check_env
+          rm -rf allure-results
+          # remove tmp log in testcase
+          rm -rf /nvme/qa_test_models/autotest_model/log/*
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
+          ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
+      - name: Test lmdeploy - chat  # one pytest pass per GPU-count marker; xdist workers shrink as GPUs per test grow
+        continue-on-error: true
+        if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat'
+        run: |
+          # Every command is guarded with "|| true": the run shell is fail-fast, so an
+          pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test and not other' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true
+          mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true
+          pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true
+          mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true
+          pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true
+          mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true
+          pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true
+      - name: Test lmdeploy - pipeline  # one pytest pass per GPU-count marker; a pass that leaves no .coverage must not skip the later passes
+        continue-on-error: true
+        if: matrix.function == 'pipeline'
+        run: |
+          pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true
+          mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true
+          pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true
+          mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true
+          pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true
+          mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true
+          pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true
+          mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true
+      - name: Test lmdeploy - restful  # one pytest pass per GPU-count marker; a pass that leaves no .coverage must not skip the later passes
+        continue-on-error: true
+        if: matrix.function == 'restful'
+        run: |
+          pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and not other' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true
+          mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true
+          pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true
+          mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true
+          pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true
+          mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true
+          pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true
+          mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true
+      - name: Clear workfile  # opens up report permissions, then recreates an empty world-writable workspace
+        if: always()
+        run: |
+          chmod -R 777 "$REPORT_DIR" || true  # REPORT_DIR may not exist if an earlier step failed; cleanup below must still run
+          export workdir="$(pwd)"
+          cd ..
+          rm -rf "$workdir"
+          mkdir "$workdir"
+          chmod -R 777 "$workdir"
+
+  test_restful:  # serves Intern-S1 via `lmdeploy serve api_server` and runs the restful interface tests against it
+    if: ${{!cancelled() && !contains(needs.download_pkgs.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}}
+    runs-on: [self-hosted, h800-r1]
+    needs: download_pkgs
+    strategy:
+      fail-fast: false
+      matrix:
+        backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}}
+        model: ['Intern-S1']
+        include:  # attaches tp=8 to every matrix entry whose model is Intern-S1
+          - tp: 8
+            model: Intern-S1
+    timeout-minutes: 60
+    container:
+      image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8
+      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
+      volumes:
+        - /nvme/github-actions/pip-cache:/root/.cache/pip
+        - /nvme/qa_test_models:/nvme/qa_test_models
+        - /nvme1/qa_test_models:/nvme1/qa_test_models
+        - /nvme2/share:/nvme2/share
+        - /mnt/137_nvme2:/mnt/137_nvme2
+        - /mnt/137_nvme3:/mnt/137_nvme3
+        - /mnt/137_nvme4:/mnt/137_nvme4
+        - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
+    steps:
+      - name: Copy repository and Artifacts
+        run: |
+          cp -r ${{env.TEST_CODE_PATH}}/. .
+      - name: Install lmdeploy - dependency
+        run: |
+          python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
+      - name: Install lmdeploy
+        run: |
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip install -r requirements/test.txt
+      - name: Check env
+        run: |
+          python3 -m pip list
+          lmdeploy check_env
+          rm -rf allure-results
+          # remove tmp log in testcase
+          rm -rf /nvme/qa_test_models/autotest_model/log/*
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
+          ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
+      - name: Start restful api
+        if:  matrix.model != 'internlm2_5-20b'
+        run: |
+          lmdeploy serve api_server /nvme/qa_test_models/internlm/${{matrix.model}} --tp ${{matrix.tp}} --backend ${{matrix.backend}} > ${{env.REPORT_DIR}}/${{matrix.backend}}_${{matrix.model}}_start_chat_restful.log 2>&1 &
+          echo "restful_pid=$!" >> "$GITHUB_ENV"
+          sleep 600s  # fixed wait after backgrounding the server; nothing polls for readiness
+      - name: Test lmdeploy - restful api
+        if:  matrix.model == 'Intern-S1'
+        timeout-minutes: 30
+        run: |
+          pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}} and not internlm2_5' --alluredir=${{env.REPORT_DIR}}/interface-${{matrix.model}}-${{matrix.backend}}_ ${{env.COV_PARAM}} || true
+          mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S')
+      - name: Kill api server
+        if: always() && matrix.model != 'internlm2_5-20b'
+        run: |
+          # always() keeps this cleanup running after a failed or timed-out test step; the pid is empty if the server never started
+          [ -z "$restful_pid" ] || kill -15 "$restful_pid"
+      - name: Clear workfile
+        if: always()
+        run: |
+          chmod -R 777 $REPORT_DIR
+          export workdir=$(pwd)
+          cd ..
+          rm -rf $workdir
+          mkdir $workdir
+          chmod -R 777 $workdir
+
+  get_coverage_report:  # merges the .coverage.* files the test jobs moved into REPORT_DIR
+    if: ${{!cancelled() && success()}}
+    runs-on: [self-hosted, h800-r1]
+    needs: [test_tools, test_restful]
+    timeout-minutes: 5
+    container:
+      image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8
+      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
+      volumes:
+        - /nvme/github-actions/pip-cache:/root/.cache/pip
+        - /nvme/qa_test_models:/nvme/qa_test_models
+        - /nvme1/qa_test_models:/nvme1/qa_test_models
+        - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
+    steps:
+      - name: Copy repository and Artifacts
+        run: cp -r ${{env.TEST_CODE_PATH}}/. .
+      - name: Install lmdeploy
+        run: |
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip install -r requirements/test.txt
+      - name: Get coverage report  # combine -> XML into REPORT_DIR -> console report -> keep merged data file
+        run: |
+          python3 -m pip install coverage
+          coverage combine ${{env.REPORT_DIR}}
+          coverage xml -o ${{env.REPORT_DIR}}/coverage.xml
+          coverage report -m
+          mv .coverage ${{env.REPORT_DIR}}/.coverage
+      - name: Clear workfile
+        if: always()
+        run: |
+          chmod -R 777 $REPORT_DIR
+          export workdir=$(pwd)
+          cd ..
+          rm -rf $workdir
+          mkdir $workdir
+          chmod -R 777 $workdir
diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml
index 47e1929421..be64e8743f 100644
--- a/.github/workflows/evaluate.yml
+++ b/.github/workflows/evaluate.yml
@@ -136,7 +136,7 @@ jobs:
       - name: Install lmdeploy
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Install lmdeploy - offline
         if: ${{inputs.offline_mode}}
diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml
new file mode 100644
index 0000000000..068d074452
--- /dev/null
+++ b/autotest/config-h800.yaml
@@ -0,0 +1,123 @@
+model_path: /nvme/qa_test_models  # root directory of the test models
+resource_path: /nvme/qa_test_models/resource
+dst_path: /nvme/qa_test_models/autotest_model
+log_path: /nvme/qa_test_models/autotest_model/log  # the workflow's "Check env" step wipes this directory before a run
+benchmark_path: /nvme/qa_test_models/benchmark-reports
+dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
+env_tag: h800  # NOTE(review): presumably lets the test helpers select H800-specific behaviour — confirm against the config loader
+
+tp_config:  # model name (without org prefix) -> number; presumably the tensor-parallel size, with unlisted models on a default — TODO confirm
+    Intern-S1: 8
+    Qwen3-235B-A22B: 8
+    Qwen3-235B-A22B-FP8: 8
+    Qwen3-30B-A3B: 2
+    Qwen3-32B: 2
+    gpt-oss-120b: 2
+    gpt-oss-120b-BF16: 4
+    gpt-oss-20b-BF16: 2
+
+turbomind_chat_model:  # org/name entries; NOTE(review): presumably resolved relative to model_path — confirm
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini
+    - Qwen/Qwen3-0.6B-FP8
+    - Qwen/Qwen3-1.7B-FP8
+    - Qwen/Qwen3-4B-FP8
+    - Qwen/Qwen3-8B-FP8
+    - Qwen/Qwen3-14B-FP8
+    - Qwen/Qwen3-235B-A22B
+    - Qwen/Qwen3-235B-A22B-FP8
+    - Qwen/Qwen3-30B-A3B
+    - Qwen/Qwen3-30B-A3B-FP8
+    - Qwen/Qwen3-32B
+    - Qwen/Qwen3-32B-FP8
+    - openai/gpt-oss-120b
+    - openai/gpt-oss-20b
+
+pytorch_chat_model:  # same entries as turbomind_chat_model except for the two gpt-oss lines at the end
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini
+    - Qwen/Qwen3-0.6B-FP8
+    - Qwen/Qwen3-1.7B-FP8
+    - Qwen/Qwen3-4B-FP8
+    - Qwen/Qwen3-8B-FP8
+    - Qwen/Qwen3-14B-FP8
+    - Qwen/Qwen3-235B-A22B
+    - Qwen/Qwen3-235B-A22B-FP8
+    - Qwen/Qwen3-30B-A3B
+    - Qwen/Qwen3-30B-A3B-FP8
+    - Qwen/Qwen3-32B
+    - Qwen/Qwen3-32B-FP8
+    - unsloth/gpt-oss-120b-BF16  # turbomind_chat_model lists openai/gpt-oss-120b in this position
+    - unsloth/gpt-oss-20b-BF16  # turbomind_chat_model lists openai/gpt-oss-20b in this position
+
+turbomind_vl_model:  # NOTE(review): "vl" presumably means vision-language — confirm
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini
+
+pytorch_vl_model:  # identical entries to turbomind_vl_model
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini
+
+turbomind_base_model:  # NOTE(review): presumably feeds the tests that request model_type='base_model' — confirm
+    - internlm/Intern-S1-mini
+    - Qwen/Qwen3-4B-FP8
+    - openai/gpt-oss-20b
+
+pytorch_base_model:  # differs from turbomind_base_model only in the gpt-oss entry
+    - internlm/Intern-S1-mini
+    - Qwen/Qwen3-4B-FP8
+    - unsloth/gpt-oss-20b-BF16
+
+turbomind_quatization:  # (sic) NOTE(review): the spelling presumably has to match the key the test helpers look up — confirm before renaming
+    no_awq:  # NOTE(review): the no_ prefix suggests an exclusion list — confirm; holds every entry of turbomind_chat_model
+        - internlm/Intern-S1
+        - internlm/Intern-S1-mini
+        - Qwen/Qwen3-0.6B-FP8
+        - Qwen/Qwen3-1.7B-FP8
+        - Qwen/Qwen3-4B-FP8
+        - Qwen/Qwen3-8B-FP8
+        - Qwen/Qwen3-14B-FP8
+        - Qwen/Qwen3-235B-A22B
+        - Qwen/Qwen3-235B-A22B-FP8
+        - Qwen/Qwen3-30B-A3B
+        - Qwen/Qwen3-30B-A3B-FP8
+        - Qwen/Qwen3-32B
+        - Qwen/Qwen3-32B-FP8
+        - openai/gpt-oss-120b
+        - openai/gpt-oss-20b
+    gptq:
+        - empty  # placeholder entry; presumably matches no real model — confirm
+    no_kvint4:  # every Qwen3 entry of the chat list; Intern-S1 and gpt-oss are not listed here
+        - Qwen/Qwen3-0.6B-FP8
+        - Qwen/Qwen3-1.7B-FP8
+        - Qwen/Qwen3-4B-FP8
+        - Qwen/Qwen3-8B-FP8
+        - Qwen/Qwen3-14B-FP8
+        - Qwen/Qwen3-235B-A22B
+        - Qwen/Qwen3-235B-A22B-FP8
+        - Qwen/Qwen3-30B-A3B
+        - Qwen/Qwen3-30B-A3B-FP8
+        - Qwen/Qwen3-32B
+        - Qwen/Qwen3-32B-FP8
+    no_kvint8:
+        - empty  # placeholder entry; presumably matches no real model — confirm
+
+pytorch_quatization:  # (sic) same spelling caveat as the turbomind_ key; verify against the loader before renaming
+    awq:
+        - empty  # placeholder entry; presumably matches no real model — confirm
+    w8a8:
+        - empty  # placeholder entry; presumably matches no real model — confirm
+    no_kvint4:  # same eleven Qwen3 entries as the turbomind no_kvint4 list
+        - Qwen/Qwen3-0.6B-FP8
+        - Qwen/Qwen3-1.7B-FP8
+        - Qwen/Qwen3-4B-FP8
+        - Qwen/Qwen3-8B-FP8
+        - Qwen/Qwen3-14B-FP8
+        - Qwen/Qwen3-235B-A22B
+        - Qwen/Qwen3-235B-A22B-FP8
+        - Qwen/Qwen3-30B-A3B
+        - Qwen/Qwen3-30B-A3B-FP8
+        - Qwen/Qwen3-32B
+        - Qwen/Qwen3-32B-FP8
+    no_kvint8:
+        - empty  # placeholder entry; presumably matches no real model — confirm
diff --git a/autotest/config.yaml b/autotest/config.yaml
index fab9a5af89..5844758229 100644
--- a/autotest/config.yaml
+++ b/autotest/config.yaml
@@ -33,8 +33,8 @@ tp_config:
     MiniCPM-V-2_6: 2
     gemma-2-27b-it: 2
     InternVL2-Llama3-76B-AWQ: 4
-    gpt-oss-20b: 2
-    gpt-oss-120b: 4
+    gpt-oss-20b-bf16: 2
+    gpt-oss-120b-bf16: 4
 
 
 turbomind_chat_model:
@@ -139,8 +139,8 @@ pytorch_chat_model:
     - Qwen/Qwen2.5-VL-32B-Instruct
     - Qwen/Qwen2-VL-2B-Instruct
     - Qwen/Qwen2-VL-7B-Instruct
-    - openai/gpt-oss-20b
-    - openai/gpt-oss-120b
+    - lmsys/gpt-oss-20b-bf16
+    - lmsys/gpt-oss-120b-bf16
     - mistralai/Mistral-7B-Instruct-v0.3
     - mistralai/Mixtral-8x7B-Instruct-v0.1
     - google/gemma-3-12b-it
@@ -368,5 +368,5 @@ benchmark_model:
     - Qwen/Qwen2.5-7B-Instruct
     - Qwen/Qwen2.5-72B-Instruct
     - deepseek-ai/DeepSeek-V2-Lite-Chat
-    - openai/gpt-oss-20b
-    - openai/gpt-oss-120b
+    - lmsys/gpt-oss-20b-bf16
+    - lmsys/gpt-oss-120b-bf16
diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py
index 5dbcb6256a..f6ce5acfdd 100644
--- a/autotest/tools/chat/test_command_chat_hf_pytorch.py
+++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py
@@ -305,6 +305,7 @@ def test_hf_pytorch_chat_pr(config, model, cli_case_config):
 @pytest.mark.usefixtures('cli_case_config')
 @pytest.mark.hf_pytorch_chat
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct'])
 def test_modelscope_pytorch_chat_tp1(config, model, cli_case_config, worker_id):
     os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True'
@@ -328,6 +329,7 @@ def test_modelscope_pytorch_chat_tp1(config, model, cli_case_config, worker_id):
 @pytest.mark.usefixtures('cli_case_config')
 @pytest.mark.hf_pytorch_chat
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.parametrize('model', ['meta-llama/Llama-2-7b-chat-hf'])
 def test_pytorch_chat_with_lora_tp1(config, model, cli_case_config, worker_id):
     usercase = 'chat_testcase'
@@ -349,6 +351,7 @@ def test_pytorch_chat_with_lora_tp1(config, model, cli_case_config, worker_id):
 @pytest.mark.usefixtures('cli_case_config')
 @pytest.mark.hf_pytorch_chat
 @pytest.mark.gpu_num_2
+@pytest.mark.other
 @pytest.mark.parametrize('model', ['baichuan-inc/Baichuan2-13B-Chat'])
 def test_pytorch_chat_with_lora_tp2(config, model, cli_case_config, worker_id):
     usercase = 'chat_testcase'
diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py
index c04d0e1e26..82c747358b 100644
--- a/autotest/tools/chat/test_command_chat_hf_turbomind.py
+++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py
@@ -12,8 +12,7 @@
 @pytest.mark.gpu_num_1
 @pytest.mark.test_3090
 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1))
-@pytest.mark.parametrize('communicator', get_communicator_list())
-def test_hf_turbomind_chat_tp1(config, model, communicator, cli_case_config, worker_id):
+def test_hf_turbomind_chat_tp1(config, model, cli_case_config, worker_id):
     usercase = 'chat_testcase'
     if 'coder' in model:
         usercase = 'code_testcase'
@@ -22,8 +21,7 @@ def test_hf_turbomind_chat_tp1(config, model, communicator, cli_case_config, wor
                                                  cli_case_config.get(usercase),
                                                  model,
                                                  'turbomind',
-                                                 cuda_prefix=get_cuda_prefix_by_workerid(worker_id),
-                                                 extra=f'--communicator {communicator}')
+                                                 cuda_prefix=get_cuda_prefix_by_workerid(worker_id))
 
     if chat_log is not None:
         allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT)
@@ -103,8 +101,7 @@ def test_hf_turbomind_chat_tp8(config, model, communicator, cli_case_config, wor
 @pytest.mark.gpu_num_1
 @pytest.mark.test_3090
 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4))
-@pytest.mark.parametrize('communicator', get_communicator_list())
-def test_hf_turbomind_chat_kvint4_tp1(config, model, communicator, cli_case_config, worker_id):
+def test_hf_turbomind_chat_kvint4_tp1(config, model, cli_case_config, worker_id):
     usercase = 'chat_testcase'
     if 'coder' in model:
         usercase = 'code_testcase'
@@ -114,7 +111,7 @@ def test_hf_turbomind_chat_kvint4_tp1(config, model, communicator, cli_case_conf
                                                  model,
                                                  'turbomind',
                                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id),
-                                                 extra=f'--communicator {communicator} --quant-policy 4')
+                                                 extra='--quant-policy 4')
 
     if chat_log is not None:
         allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT)
@@ -172,8 +169,7 @@ def test_hf_turbomind_chat_kvint4_tp4(config, model, communicator, cli_case_conf
 @pytest.mark.gpu_num_1
 @pytest.mark.test_3090
 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8))
-@pytest.mark.parametrize('communicator', get_communicator_list())
-def test_hf_turbomind_chat_kvint8_tp1(config, model, communicator, cli_case_config, worker_id):
+def test_hf_turbomind_chat_kvint8_tp1(config, model, cli_case_config, worker_id):
     usercase = 'chat_testcase'
     if 'coder' in model:
         usercase = 'code_testcase'
@@ -183,7 +179,7 @@ def test_hf_turbomind_chat_kvint8_tp1(config, model, communicator, cli_case_conf
                                                  model,
                                                  'turbomind',
                                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id),
-                                                 extra=f'--communicator {communicator} --quant-policy 8')
+                                                 extra='--quant-policy 8')
 
     if chat_log is not None:
         allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT)
@@ -261,12 +257,12 @@ def test_hf_turbomind_chat_kvint4_tp8(config, model, communicator, cli_case_conf
 @pytest.mark.usefixtures('cli_case_config')
 @pytest.mark.hf_turbomind_chat
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.parametrize('model', [
     'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits',
     'microsoft/Phi-3-mini-4k-instruct-inner-w8a8'
 ])
-@pytest.mark.parametrize('communicator', get_communicator_list())
-def test_hf_turbomind_chat_fallback_backend_tp1(config, model, communicator, cli_case_config, worker_id):
+def test_hf_turbomind_chat_fallback_backend_tp1(config, model, cli_case_config, worker_id):
     usercase = 'chat_testcase'
     if 'coder' in model:
         usercase = 'code_testcase'
@@ -275,8 +271,7 @@ def test_hf_turbomind_chat_fallback_backend_tp1(config, model, communicator, cli
                                                  cli_case_config.get(usercase),
                                                  model,
                                                  'turbomind',
-                                                 cuda_prefix=get_cuda_prefix_by_workerid(worker_id),
-                                                 extra=f'--communicator {communicator}')
+                                                 cuda_prefix=get_cuda_prefix_by_workerid(worker_id))
 
     if chat_log is not None:
         allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT)
@@ -288,12 +283,12 @@ def test_hf_turbomind_chat_fallback_backend_tp1(config, model, communicator, cli
 @pytest.mark.usefixtures('cli_case_config')
 @pytest.mark.hf_turbomind_chat
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.parametrize('model', [
     'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits',
     'microsoft/Phi-3-mini-4k-instruct-inner-w8a8'
 ])
-@pytest.mark.parametrize('communicator', get_communicator_list())
-def test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, communicator, cli_case_config, worker_id):
+def test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, cli_case_config, worker_id):
     usercase = 'chat_testcase'
     if 'coder' in model:
         usercase = 'code_testcase'
@@ -303,7 +298,7 @@ def test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, communicat
                                                  model,
                                                  'turbomind',
                                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id),
-                                                 extra=f'--communicator {communicator} --quant-policy 8')
+                                                 extra='--quant-policy 8')
 
     if chat_log is not None:
         allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT)
@@ -315,6 +310,7 @@ def test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, communicat
 @pytest.mark.usefixtures('cli_case_config')
 @pytest.mark.hf_turbomind_chat
 @pytest.mark.gpu_num_2
+@pytest.mark.other
 @pytest.mark.parametrize('model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat'])
 @pytest.mark.parametrize('communicator', get_communicator_list())
 def test_hf_turbomind_chat_fallback_backend_tp2(config, model, communicator, cli_case_config, worker_id):
@@ -337,6 +333,7 @@ def test_hf_turbomind_chat_fallback_backend_tp2(config, model, communicator, cli
 @pytest.mark.usefixtures('cli_case_config')
 @pytest.mark.hf_turbomind_chat
 @pytest.mark.gpu_num_2
+@pytest.mark.other
 @pytest.mark.parametrize('model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat'])
 @pytest.mark.parametrize('communicator', get_communicator_list())
 def test_hf_turbomind_chat_fallback_backend_kvint8_tp2(config, model, communicator, cli_case_config, worker_id):
@@ -361,16 +358,14 @@ def test_hf_turbomind_chat_fallback_backend_kvint8_tp2(config, model, communicat
 @pytest.mark.gpu_num_1
 @pytest.mark.test_3090
 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='base_model'))
-@pytest.mark.parametrize('communicator', get_communicator_list())
-def test_hf_turbomind_base_tp1(config, model, communicator, cli_case_config, worker_id):
+def test_hf_turbomind_base_tp1(config, model, cli_case_config, worker_id):
     usercase = 'base_testcase'
     result, chat_log, msg = hf_command_line_test(config,
                                                  usercase,
                                                  cli_case_config.get(usercase),
                                                  model,
                                                  'turbomind',
-                                                 cuda_prefix=get_cuda_prefix_by_workerid(worker_id),
-                                                 extra=f'--communicator {communicator}')
+                                                 cuda_prefix=get_cuda_prefix_by_workerid(worker_id))
 
     if chat_log is not None:
         allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT)
@@ -410,6 +405,28 @@ def test_hf_turbomind_base_tp2(config, model, communicator, cli_case_config, wor
 ])
 @pytest.mark.parametrize('communicator', get_communicator_list())
 def test_hf_turbomind_chat_pr(config, model, communicator, cli_case_config):
+    usercase = 'chat_testcase'
+    result, chat_log, msg = hf_command_line_test(config,
+                                                 usercase,
+                                                 cli_case_config.get(usercase),
+                                                 model,
+                                                 'turbomind',
+                                                 cuda_prefix='CUDA_VISIBLE_DEVICES=5,6',
+                                                 extra=f'--communicator {communicator}')
+
+    if chat_log is not None:
+        allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT)
+
+    assert result, msg
+
+
+@pytest.mark.order(10)
+@pytest.mark.usefixtures('cli_case_config')
+@pytest.mark.hf_turbomind_chat
+@pytest.mark.gpu_num_1
+@pytest.mark.pr_test
+@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3-8B'])
+def test_hf_turbomind_chat_pr_gpu1(config, model, cli_case_config):
     usercase = 'chat_testcase'
     device_type = os.environ.get('DEVICE', 'cuda')
     if device_type == 'ascend':
@@ -421,9 +438,7 @@ def test_hf_turbomind_chat_pr(config, model, communicator, cli_case_config):
                                                  cli_case_config.get(usercase),
                                                  model,
                                                  'turbomind',
-                                                 cuda_prefix=f'{env_var}5,6',
-                                                 extra=f'--communicator {communicator}')
-
+                                                 cuda_prefix=env_var + '5,6')
     if chat_log is not None:
         allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT)
 
@@ -434,6 +449,7 @@ def test_hf_turbomind_chat_pr(config, model, communicator, cli_case_config):
 @pytest.mark.usefixtures('cli_case_config')
 @pytest.mark.hf_turbomind_chat
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct'])
 def test_modelscope_turbomind_chat_tp1(config, model, cli_case_config, worker_id):
     os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True'
diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py
index 14285f3c91..eaa7942e5c 100644
--- a/autotest/tools/pipeline/llm_case.py
+++ b/autotest/tools/pipeline/llm_case.py
@@ -7,7 +7,7 @@
 from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline
 from lmdeploy.utils import is_bf16_supported
 
-gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2)
+gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=10)
 
 
 def _is_bf16_supported_by_device():
diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py
index 5a649a1cca..62662f0b03 100644
--- a/autotest/tools/pipeline/mllm_case.py
+++ b/autotest/tools/pipeline/mllm_case.py
@@ -11,7 +11,7 @@
 from lmdeploy.vl.constants import IMAGE_TOKEN
 from lmdeploy.vl.utils import encode_image_base64
 
-gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2)
+gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=10)
 
 PIC1 = 'tiger.jpeg'
 PIC2 = 'human-pose.jpg'
diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py
index b9a6939675..2ddc9240da 100644
--- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py
+++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py
@@ -161,6 +161,7 @@ def test_pipeline_chat_pytorch_pr(config, common_case_config, model, worker_id):
 @pytest.mark.usefixtures('common_case_config')
 @pytest.mark.pipeline_chat_pytorch
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct'])
 def test_modelscope_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id):
@@ -175,6 +176,7 @@ def test_modelscope_pipeline_chat_pytorch_tp1(config, common_case_config, model,
 @pytest.mark.usefixtures('common_case_config')
 @pytest.mark.pipeline_chat_pytorch
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('model', ['meta-llama/Llama-2-7b-chat-hf'])
 def test_pipeline_chat_pytorch_with_lora_tp1(config, common_case_config, model, worker_id):
@@ -190,6 +192,7 @@ def test_pipeline_chat_pytorch_with_lora_tp1(config, common_case_config, model,
 @pytest.mark.usefixtures('common_case_config')
 @pytest.mark.pipeline_chat_pytorch
 @pytest.mark.gpu_num_2
+@pytest.mark.other
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('model', ['baichuan-inc/Baichuan2-13B-Chat'])
 def test_pipeline_chat_pytorch_with_lora_tp2(config, common_case_config, model, worker_id):
diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py
index 56caa2e6e7..f7a93db09a 100644
--- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py
+++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py
@@ -12,11 +12,10 @@
 @pytest.mark.test_3090
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1))
-@pytest.mark.parametrize('communicator', get_communicator_list())
-def test_pipeline_chat_tp1(config, common_case_config, model, communicator, worker_id):
+def test_pipeline_chat_tp1(config, common_case_config, model, worker_id):
     if 'gw' in worker_id:
         set_device_env_variable(worker_id)
-    run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {'communicator': communicator})
+    run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {})
 
 
 @pytest.mark.order(6)
@@ -65,14 +64,10 @@ def test_pipeline_chat_tp8(config, common_case_config, model, communicator, work
 @pytest.mark.test_3090
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4))
-@pytest.mark.parametrize('communicator', get_communicator_list())
-def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, communicator, worker_id):
+def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, worker_id):
     if 'gw' in worker_id:
         set_device_env_variable(worker_id)
-    run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, {
-        'quant_policy': 4,
-        'communicator': communicator
-    })
+    run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, {'quant_policy': 4})
 
 
 @pytest.mark.order(6)
@@ -116,14 +111,10 @@ def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, communicato
 @pytest.mark.test_3090
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8))
-@pytest.mark.parametrize('communicator', get_communicator_list())
-def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, communicator, worker_id):
+def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, worker_id):
     if 'gw' in worker_id:
         set_device_env_variable(worker_id)
-    run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, {
-        'quant_policy': 8,
-        'communicator': communicator
-    })
+    run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, {'quant_policy': 8})
 
 
 @pytest.mark.order(6)
@@ -179,21 +170,16 @@ def test_pipeline_chat_kvint8_tp8(config, common_case_config, model, communicato
 @pytest.mark.pipeline_chat
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.parametrize('model', [
     'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits',
     'microsoft/Phi-3-mini-4k-instruct-inner-w8a8'
 ])
-@pytest.mark.parametrize('communicator', get_communicator_list())
-def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, communicator, worker_id):
+def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, worker_id):
     if 'gw' in worker_id:
         set_device_env_variable(worker_id, tp_num=1)
         os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500)
-    run_pipeline_chat_test(config,
-                           common_case_config,
-                           model,
-                           'turbomind',
-                           worker_id, {'communicator': communicator},
-                           is_smoke=True)
+    run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {}, is_smoke=True)
 
 
 @pytest.mark.order(6)
@@ -201,12 +187,12 @@ def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, c
 @pytest.mark.pipeline_chat
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.parametrize('model', [
     'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits',
     'microsoft/Phi-3-mini-4k-instruct-inner-w8a8'
 ])
-@pytest.mark.parametrize('communicator', get_communicator_list())
-def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, model, communicator, worker_id):
+def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, model, worker_id):
     if 'gw' in worker_id:
         set_device_env_variable(worker_id, tp_num=1)
         os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500)
@@ -214,10 +200,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, m
                            common_case_config,
                            model,
                            'turbomind-kvint',
-                           worker_id, {
-                               'quant_policy': 8,
-                               'communicator': communicator
-                           },
+                           worker_id, {'quant_policy': 8},
                            is_smoke=True)
 
 
@@ -226,6 +209,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, m
 @pytest.mark.pipeline_chat
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_2
+@pytest.mark.other
 @pytest.mark.parametrize('model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat'])
 @pytest.mark.parametrize('communicator', get_communicator_list())
 def test_pipeline_chat_fallback_backend_tp2(config, common_case_config, model, communicator, worker_id):
@@ -245,6 +229,7 @@ def test_pipeline_chat_fallback_backend_tp2(config, common_case_config, model, c
 @pytest.mark.pipeline_chat
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_2
+@pytest.mark.other
 @pytest.mark.parametrize('model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat'])
 @pytest.mark.parametrize('communicator', get_communicator_list())
 def test_pipeline_chat_fallback_backend_kvint8_tp2(config, common_case_config, model, communicator, worker_id):
@@ -286,6 +271,7 @@ def test_pipeline_chat_pr(config, common_case_config, model, communicator, worke
 @pytest.mark.usefixtures('common_case_config')
 @pytest.mark.pipeline_chat
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct'])
 def test_modelscope_pipeline_chat_tp1(config, common_case_config, model, worker_id):
diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py
index 44ded4473f..bcfd071eba 100644
--- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py
+++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py
@@ -15,11 +15,10 @@
 @pytest.mark.gpu_num_1
 @pytest.mark.test_3090
 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='vl_model'))
-@pytest.mark.parametrize('communicator', get_communicator_list())
-def test_pipeline_chat_tp1(config, model, communicator, worker_id):
+def test_pipeline_chat_tp1(config, model, worker_id):
     if 'gw' in worker_id:
         set_device_env_variable(worker_id)
-    run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator})
+    run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {})
 
 
 @pytest.mark.order(6)
@@ -57,14 +56,10 @@ def test_pipeline_chat_tp4(config, model, communicator, worker_id):
 @pytest.mark.gpu_num_1
 @pytest.mark.test_3090
 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4, model_type='vl_model'))
-@pytest.mark.parametrize('communicator', get_communicator_list())
-def test_pipeline_chat_kvint4_tp1(config, model, communicator, worker_id):
+def test_pipeline_chat_kvint4_tp1(config, model, worker_id):
     if 'gw' in worker_id:
         set_device_env_variable(worker_id)
-    run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {
-        'quant_policy': 4,
-        'communicator': communicator
-    })
+    run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 4})
 
 
 @pytest.mark.order(6)
@@ -105,14 +100,10 @@ def test_pipeline_chat_kvint4_tp4(config, model, communicator, worker_id):
 @pytest.mark.gpu_num_1
 @pytest.mark.test_3090
 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8, model_type='vl_model'))
-@pytest.mark.parametrize('communicator', get_communicator_list())
-def test_pipeline_chat_kvint8_tp1(config, model, communicator, worker_id):
+def test_pipeline_chat_kvint8_tp1(config, model, worker_id):
     if 'gw' in worker_id:
         set_device_env_variable(worker_id)
-    run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {
-        'quant_policy': 8,
-        'communicator': communicator
-    })
+    run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8})
 
 
 @pytest.mark.order(6)
@@ -151,33 +142,26 @@ def test_pipeline_chat_kvint8_tp4(config, model, communicator, worker_id):
 @pytest.mark.pipeline_chat
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_2
+@pytest.mark.other
 @pytest.mark.parametrize('model', ['OpenGVLab/InternVL2-4B', 'THUDM/glm-4v-9b', 'THUDM/glm-4v-9b-inner-4bits'])
-@pytest.mark.parametrize('communicator', get_communicator_list())
-def test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_id):
+def test_pipeline_chat_fallback_backend_tp1(config, model, worker_id):
     if 'gw' in worker_id:
         set_device_env_variable(worker_id, tp_num=1)
         os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500)
-    run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}, is_smoke=True)
+    run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {}, is_smoke=True)
 
 
 @pytest.mark.order(6)
 @pytest.mark.pipeline_chat
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_2
+@pytest.mark.other
 @pytest.mark.parametrize('model', ['OpenGVLab/InternVL2-4B', 'THUDM/glm-4v-9b', 'THUDM/glm-4v-9b-inner-4bits'])
-@pytest.mark.parametrize('communicator', get_communicator_list())
-def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, worker_id):
+def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, worker_id):
     if 'gw' in worker_id:
         set_device_env_variable(worker_id, tp_num=1)
         os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500)
-    run_pipeline_vl_chat_test(config,
-                              model,
-                              BACKEND_KVINT,
-                              worker_id, {
-                                  'quant_policy': 8,
-                                  'communicator': communicator
-                              },
-                              is_smoke=True)
+    run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8}, is_smoke=True)
 
 
 @pytest.mark.order(6)
@@ -207,8 +191,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp2(config, model, communicator,
 @pytest.mark.parametrize(
     'model',
     ['liuhaotian/llava-v1.6-vicuna-7b', 'OpenGVLab/InternVL2-4B', 'OpenGVLab/InternVL2-8B', 'OpenGVLab/InternVL3-8B'])
-@pytest.mark.parametrize('communicator', get_communicator_list())
-def test_pipeline_pr_test(config, model, communicator, worker_id):
+def test_pipeline_pr_test(config, model, worker_id):
     device_type = os.environ.get('DEVICE', 'cuda')
     if device_type == 'ascend':
         env_var = 'ASCEND_RT_VISIBLE_DEVICES'
@@ -216,4 +199,4 @@ def test_pipeline_pr_test(config, model, communicator, worker_id):
         env_var = 'CUDA_VISIBLE_DEVICES'
     if 'gw' in worker_id:
         os.environ[f'{env_var}'] = str(int(get_cuda_id_by_workerid(worker_id)) + 5)
-    run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}, is_smoke=True)
+    run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {}, is_smoke=True)
diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
index 6c48007565..9f6c747edb 100644
--- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
+++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
@@ -176,6 +176,7 @@ def test_restful_chat_kvint8_tp8(config, common_case_config, worker_id):
 @pytest.mark.usefixtures('common_case_config')
 @pytest.mark.restful_api
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.parametrize('prepare_environment', [{
     'model': 'Qwen/Qwen2.5-7B-Instruct',
     'cuda_prefix': None,
@@ -194,6 +195,7 @@ def test_modelscope_restful_chat_tp1(config, common_case_config, worker_id):
 @pytest.mark.usefixtures('common_case_config')
 @pytest.mark.restful_api_pytorch
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.parametrize('prepare_environment', [{
     'model': 'meta-llama/Llama-2-7b-chat-hf',
     'cuda_prefix': None,
@@ -212,6 +214,7 @@ def test_restful_chat_with_lora_tp1(config, common_case_config, worker_id):
 @pytest.mark.usefixtures('common_case_config')
 @pytest.mark.restful_api_pytorch
 @pytest.mark.gpu_num_2
+@pytest.mark.other
 @pytest.mark.parametrize('prepare_environment',
                          [{
                              'model': 'baichuan-inc/Baichuan2-13B-Chat',
@@ -232,6 +235,7 @@ def test_restful_chat_with_lora_tp2(config, common_case_config, worker_id):
 @pytest.mark.restful_api
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.parametrize('prepare_environment', [
     {
         'model': 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
@@ -253,6 +257,7 @@ def test_restful_chat_reasoning_tp1(config, worker_id):
 @pytest.mark.restful_api
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_2
+@pytest.mark.other
 @pytest.mark.parametrize('prepare_environment', [
     {
         'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
@@ -274,6 +279,7 @@ def test_restful_chat_reasoning_tp2(config, worker_id):
 @pytest.mark.restful_api
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.parametrize('prepare_environment', [
     {
         'model': 'internlm/internlm2_5-7b-chat',
@@ -301,6 +307,7 @@ def test_restful_chat_tools_tp1(config, worker_id):
 @pytest.mark.restful_api
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_2
+@pytest.mark.other
 @pytest.mark.parametrize('prepare_environment', [
     {
         'model': 'internlm/internlm2_5-20b-chat',
@@ -322,6 +329,7 @@ def test_restful_chat_tools_tp2(config, worker_id):
 @pytest.mark.restful_api
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_4
+@pytest.mark.other
 @pytest.mark.parametrize('prepare_environment', [
     {
         'model': 'meta-llama/Meta-Llama-3-1-70B-Instruct',
diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py
index b692bd17b5..daf2664662 100644
--- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py
+++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py
@@ -80,7 +80,7 @@ def test_restful_chat_tp8(config, common_case_config, worker_id):
 
 def getKvintModelList(tp_num, quant_policy):
     model_list = []
-    for communicator in get_communicator_list():
+    for communicator in get_communicator_list(tp_num):
         model_list += [{
             'model': item,
             'cuda_prefix': None,
@@ -180,6 +180,7 @@ def test_restful_chat_kvint8_tp8(config, common_case_config, worker_id):
 @pytest.mark.usefixtures('common_case_config')
 @pytest.mark.restful_api
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.parametrize('prepare_environment', [
     {
         'model': 'microsoft/Phi-3-mini-4k-instruct',
@@ -200,37 +201,19 @@ def test_restful_chat_kvint8_tp8(config, common_case_config, worker_id):
         'model': 'microsoft/Phi-3-mini-4k-instruct',
         'cuda_prefix': None,
         'tp_num': 1,
-        'extra': ' --communicator native'
-    },
-    {
-        'model': 'microsoft/Phi-3-mini-4k-instruct-inner-4bits',
-        'cuda_prefix': None,
-        'tp_num': 1,
-        'extra': ' --communicator native'
-    },
-    {
-        'model': 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8',
-        'cuda_prefix': None,
-        'tp_num': 1,
-        'extra': ' --communicator native'
-    },
-    {
-        'model': 'microsoft/Phi-3-mini-4k-instruct',
-        'cuda_prefix': None,
-        'tp_num': 1,
-        'extra': ' --quant-policy 8 --communicator native'
+        'extra': ' --quant-policy 8'
     },
     {
         'model': 'microsoft/Phi-3-mini-4k-instruct-inner-4bits',
         'cuda_prefix': None,
         'tp_num': 1,
-        'extra': ' --quant-policy 8 --communicator native'
+        'extra': ' --quant-policy 8'
     },
     {
         'model': 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8',
         'cuda_prefix': None,
         'tp_num': 1,
-        'extra': ' --quant-policy 8 --communicator native'
+        'extra': ' --quant-policy 8'
     },
 ],
                          indirect=True)
@@ -246,6 +229,7 @@ def test_restful_chat_fallback_backend_tp1(config, common_case_config, worker_id
 @pytest.mark.usefixtures('common_case_config')
 @pytest.mark.restful_api
 @pytest.mark.gpu_num_2
+@pytest.mark.other
 @pytest.mark.parametrize('prepare_environment', [
     {
         'model': 'google/gemma-2-27b-it',
@@ -357,6 +341,7 @@ def test_restful_logprobs(worker_id):
 @pytest.mark.usefixtures('common_case_config')
 @pytest.mark.restful_api
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.parametrize('prepare_environment', [{
     'model': 'Qwen/Qwen2.5-7B-Instruct',
     'cuda_prefix': None,
@@ -376,6 +361,7 @@ def test_modelscope_restful_chat_tp1(config, common_case_config, worker_id):
 @pytest.mark.restful_api
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.parametrize('prepare_environment', [
     {
         'model': 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
@@ -397,6 +383,7 @@ def test_restful_chat_reasoning_tp1(config, worker_id):
 @pytest.mark.restful_api
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_2
+@pytest.mark.other
 @pytest.mark.parametrize('prepare_environment', [
     {
         'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
@@ -418,6 +405,7 @@ def test_restful_chat_reasoning_tp2(config, worker_id):
 @pytest.mark.restful_api
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.parametrize('prepare_environment', [
     {
         'model': 'internlm/internlm2_5-7b-chat',
@@ -445,6 +433,7 @@ def test_restful_chat_tools_tp1(config, worker_id):
 @pytest.mark.restful_api
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_2
+@pytest.mark.other
 @pytest.mark.parametrize('prepare_environment', [
     {
         'model': 'internlm/internlm2_5-20b-chat',
@@ -466,6 +455,7 @@ def test_restful_chat_tools_tp2(config, worker_id):
 @pytest.mark.restful_api
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_4
+@pytest.mark.other
 @pytest.mark.parametrize('prepare_environment', [
     {
         'model': 'meta-llama/Meta-Llama-3-1-70B-Instruct',
diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py
index 2cfbc00020..a98fbbfd6c 100644
--- a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py
+++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py
@@ -19,7 +19,7 @@ def prepare_environment(request, config, worker_id):
 
 def getModelList(tp_num):
     model_list = []
-    for communicator in get_communicator_list():
+    for communicator in get_communicator_list(tp_num):
         model_list += [{
             'model': item,
             'cuda_prefix': None,
@@ -65,7 +65,7 @@ def test_restful_chat_tp4(config, worker_id):
 
 def getKvintModelList(tp_num, quant_policy: int = None):
     model_list = []
-    for communicator in get_communicator_list():
+    for communicator in get_communicator_list(tp_num):
         model_list += [{
             'model': item,
             'cuda_prefix': None,
@@ -146,6 +146,7 @@ def test_restful_chat_kvint8_tp4(config, worker_id):
 @pytest.mark.order(7)
 @pytest.mark.restful_api_vl
 @pytest.mark.gpu_num_1
+@pytest.mark.other
 @pytest.mark.parametrize('prepare_environment', [
     {
         'model': 'OpenGVLab/InternVL2-4B',
@@ -171,25 +172,25 @@ def test_restful_chat_kvint8_tp4(config, worker_id):
         'model': 'OpenGVLab/InternVL2-4B',
         'cuda_prefix': None,
         'tp_num': 1,
-        'extra': ' --quant-policy 8 --communicator native'
+        'extra': ' --quant-policy 8'
     },
     {
         'model': 'Qwen/Qwen2.5-VL-7B-Instruct',
         'cuda_prefix': None,
         'tp_num': 1,
-        'extra': ' --quant-policy 8 --communicator native'
+        'extra': ' --quant-policy 8'
     },
     {
         'model': 'THUDM/glm-4v-9b',
         'cuda_prefix': None,
         'tp_num': 1,
-        'extra': ' --quant-policy 8 --communicator native'
+        'extra': ' --quant-policy 8'
     },
     {
         'model': 'THUDM/glm-4v-9b-inner-4bits',
         'cuda_prefix': None,
         'tp_num': 1,
-        'extra': ' --quant-policy 8 --communicator native'
+        'extra': ' --quant-policy 8'
     },
 ],
                          indirect=True)
diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py
index 0df8858b2c..51de106840 100644
--- a/autotest/utils/config_utils.py
+++ b/autotest/utils/config_utils.py
@@ -83,8 +83,8 @@ def get_all_model_list(tp_num: int = None, quant_policy: int = None, model_type:
     return case_list
 
 
-def get_communicator_list():
-    if _is_bf16_supported_by_device():
+def get_communicator_list(tp_num: int = None):
+    if tp_num != 1 and _is_bf16_supported_by_device():
         return ['native', 'nccl']
     return ['nccl']
 
diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py
index a1a9a3c512..fabc074d37 100644
--- a/autotest/utils/pipeline_chat.py
+++ b/autotest/utils/pipeline_chat.py
@@ -299,13 +299,14 @@ def internvl_vl_testcase(output_text, f, lang: str = 'en'):
             assert case_result, f'reason: separate images: panda should in {response}'
     with allure.step(f'internvl-separate-images2-{lang}'):
         response = get_response_from_output(output_text, f'internvl-separate-images2-{lang}')
-        case_result = any(word in response.lower() for word in ['panda', '熊猫', 'same', 'different', 'difference'])
+        case_result = any(word in response.lower()
+                          for word in ['panda', '熊猫', 'same', 'different', 'difference', 'identical'])
         f.writelines(f'internvl-separate-images2-{lang} result: {case_result}, reason: panda should in {response} \n')
         with assume:
             assert case_result, f'reason: separate images2: panda should in {response}'
     with allure.step(f'internvl-video-{lang}'):
         response = get_response_from_output(output_text, f'internvl-video-{lang}')
-        case_result = any(word in response.lower() for word in ['red panda', 'eat', '熊猫', '竹子', 'food'])
+        case_result = any(word in response.lower() for word in ['red panda', 'eat', '熊猫', '竹子', 'food', 'hold'])
         f.writelines(f'internvl-video-{lang} result: {case_result}, reason: panda should in {response} \n')
         with assume:
             assert case_result, f'reason: video: panda should in {response}'
diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py
index 5aca937681..88c9468823 100644
--- a/autotest/utils/run_restful_chat.py
+++ b/autotest/utils/run_restful_chat.py
@@ -148,7 +148,7 @@ def run_all_step(config, cases_info, worker_id: str = '', port: int = DEFAULT_PO
 
         case_info = cases_info.get(case)
 
-        with allure.step(case + ' step2 - restful_test - openai chat'):
+        with allure.step(case + ' restful_test - openai chat'):
             restful_result, restful_log, msg = open_chat_test(config, case, case_info, model, http_url, worker_id)
             allure.attach.file(restful_log, attachment_type=allure.attachment_type.TEXT)
         with assume: