From e4fe55362ded353ef6b88f96fa021095bd03c627 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 16 Sep 2025 10:41:56 +0800 Subject: [PATCH 01/13] add h800 --- .github/workflows/benchmark.yml | 4 +- .github/workflows/daily_ete_test.yml | 4 +- .github/workflows/daily_ete_test_3090.yml | 8 +- .github/workflows/daily_ete_test_5080.yml | 18 +- .github/workflows/daily_ete_test_h800.yml | 388 ++++++++++++++++++++++ autotest/config-h800.yaml | 118 +++++++ 6 files changed, 523 insertions(+), 17 deletions(-) create mode 100644 .github/workflows/daily_ete_test_h800.yml create mode 100644 autotest/config-h800.yaml diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 0ee4db9a28..62d2e19ca5 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -27,7 +27,7 @@ on: env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai - OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} + OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }} REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true FAIL_CONFIG: ${{ github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} @@ -42,7 +42,7 @@ jobs: env: PYTHON_VERSION: ${{ matrix.pyver }} PLAT_NAME: manylinux2014_x86_64 - DOCKER_TAG: cuda11.8 + DOCKER_TAG: cuda12.4 steps: - name: Checkout repository uses: actions/checkout@v3 diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index f10bc2993b..8a90df4309 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -44,7 +44,7 @@ on: env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai - OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} + OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} COV_PARAM: 
--cov /opt/py3/lib/python3.10/site-packages/lmdeploy @@ -64,7 +64,7 @@ jobs: env: PYTHON_VERSION: ${{ matrix.pyver }} PLAT_NAME: manylinux2014_x86_64 - DOCKER_TAG: cuda11.8 + DOCKER_TAG: cuda12.4 steps: - name: Checkout repository uses: actions/checkout@v3 diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 21d200a405..6c0a0942c5 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -153,7 +153,7 @@ jobs: python3 -m pip install -r requirements/lite.txt - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install transformers==4.53.1 datasets==3.6.0 timm python3 -m pip install -r requirements/test.txt - name: Check env @@ -226,7 +226,7 @@ jobs: python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | @@ -290,7 +290,7 @@ jobs: python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | @@ -370,7 +370,7 @@ jobs: run: cp -r ${{env.TEST_CODE_PATH}}/. . 
- name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Get coverage report run: | diff --git a/.github/workflows/daily_ete_test_5080.yml b/.github/workflows/daily_ete_test_5080.yml index 7d9b250385..bd12b0beff 100644 --- a/.github/workflows/daily_ete_test_5080.yml +++ b/.github/workflows/daily_ete_test_5080.yml @@ -92,7 +92,7 @@ jobs: download_pkgs: needs: linux-build if: ${{!cancelled()}} - runs-on: [self-hosted, 5090-r1] + runs-on: [self-hosted, 5080-r1] timeout-minutes: 50 container: image: openmmlab/lmdeploy:latest-cu12.8 @@ -129,7 +129,7 @@ jobs: test_quantization: needs: download_pkgs if: ${{!cancelled() && contains(needs.download_pkgs.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} - runs-on: [self-hosted, 5090-r1] + runs-on: [self-hosted, 5080-r1] timeout-minutes: 150 env: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA @@ -153,7 +153,7 @@ jobs: python3 -m pip install -r requirements/lite.txt - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install transformers==4.53.1 datasets==3.6.0 timm python3 -m pip install -r requirements/test.txt - name: Check env @@ -188,7 +188,7 @@ jobs: chmod -R 777 $workdir test_tools: if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} - runs-on: [self-hosted, 5090-r1] + runs-on: [self-hosted, 5080-r1] needs: test_quantization timeout-minutes: 300 strategy: @@ -225,7 +225,7 @@ jobs: python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps 
+ python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | @@ -265,7 +265,7 @@ jobs: chmod -R 777 $workdir test_restful: if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} - runs-on: [self-hosted, 5090-r1] + runs-on: [self-hosted, 5080-r1] needs: test_quantization strategy: fail-fast: false @@ -289,7 +289,7 @@ jobs: python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | @@ -353,7 +353,7 @@ jobs: chmod -R 777 $workdir get_coverage_report: if: ${{!cancelled() && success()}} - runs-on: [self-hosted, 5090-r1] + runs-on: [self-hosted, 5080-r1] needs: [test_tools, test_restful] timeout-minutes: 5 container: @@ -368,7 +368,7 @@ jobs: run: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Get coverage report run: | diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml new file mode 100644 index 0000000000..dfcbe2d718 --- /dev/null +++ b/.github/workflows/daily_ete_test_h800.yml @@ -0,0 +1,388 @@ +name: daily_ete_test_h800 + +on: + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. 
Default is "main"' + type: string + default: 'main' + backend: + required: true + description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch']" + model: + required: true + description: 'Set testcase module filter: llm, mllm. Default contains all models' + type: string + default: "['llm','mllm']" + function: + required: true + description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions' + type: string + default: '["pipeline", "restful", "chat"]' + offline_mode: + required: true + description: 'Whether to start an offline mode, if true, you should prepare code and whl package by yourself' + type: boolean + default: false + regression_func: + required: true + description: 'regression functions' + type: string + default: "['quant', 'tools','restful','pipeline']" + schedule: + - cron: '00 14 * * 0-4' + +env: + HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache + HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai + OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} + COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} + TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ github.run_id }} + OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy + OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt + +jobs: + linux-build: + if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.8 + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + repository: 
${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + + download_pkgs: + needs: linux-build + if: ${{!cancelled()}} + runs-on: [self-hosted, h800-r1] + timeout-minutes: 50 + container: + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme2/share:/nvme2/share + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-h800.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-h800.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + + test_quantization: + needs: download_pkgs + if: ${{!cancelled() && contains(needs.download_pkgs.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} + runs-on: [self-hosted, h800-r1] + timeout-minutes: 150 + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub + MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + container: + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme2/share:/nvme2/share + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy - dependency + run: | + python3 -m pip install auto_gptq matplotlib attrdict + python3 -m pip install -r requirements/lite.txt + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install transformers==4.53.1 datasets==3.6.0 timm + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - quantization w4a16 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - quantization w8a8 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'pytorch') + run: | + pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. 
+ rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_tools: + if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + runs-on: [self-hosted, h800-r1] + needs: test_quantization + timeout-minutes: 300 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} + function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} + exclude: + - backend: turbomind + model: mllm + function: chat + - backend: pytorch + model: mllm + function: chat + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub + MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + container: + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme2/share:/nvme2/share + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - chat + continue-on-error: true + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - pipeline + continue-on-error: true + if: matrix.function == 'pipeline' + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and test_3090' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage 
${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - restful + continue-on-error: true + if: matrix.function == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: 
always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_restful: + if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + runs-on: [self-hosted, h800-r1] + needs: test_quantization + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model: ['Intern-S1', 'internlm2_5-20b-chat', 'internlm2_5-20b'] + include: + - tp: 8 + model: Intern-S1 + timeout-minutes: 60 + container: + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme2/share:/nvme2/share + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Start restful api + if: matrix.model != 'internlm2_5-20b' + run: | + lmdeploy serve api_server /nvme/qa_test_models/internlm/${{matrix.model}} --tp ${{matrix.tp}} --backend ${{matrix.backend}} > ${{env.REPORT_DIR}}/${{matrix.backend}}_${{matrix.model}}_start_chat_restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Test lmdeploy - restful api + if: matrix.model == 'Intern-S1' + timeout-minutes: 30 + run: | + pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}} and not internlm2_5' --alluredir=${{env.REPORT_DIR}}/interface-${{matrix.model}}-${{matrix.backend}}_ ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: matrix.model != 'internlm2_5-20b' + run: | + kill -15 "$restful_pid" + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + chmod -R 777 /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + export workdir=$(pwd) + cd .. 
+ rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + get_coverage_report: + if: ${{!cancelled() && success()}} + runs-on: [self-hosted, h800-r1] + needs: [test_tools, test_restful] + timeout-minutes: 5 + container: + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Get coverage report + run: | + pip install coverage + coverage combine ${{env.REPORT_DIR}} + coverage xml -o ${{env.REPORT_DIR}}/coverage.xml + coverage report -m + mv .coverage ${{env.REPORT_DIR}}/.coverage + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. 
+ rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml new file mode 100644 index 0000000000..11c771f5a4 --- /dev/null +++ b/autotest/config-h800.yaml @@ -0,0 +1,118 @@ +model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource +dst_path: /nvme/qa_test_models/autotest_model +log_path: /nvme/qa_test_models/autotest_model/log +benchmark_path: /nvme/qa_test_models/benchmark-reports +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +env_tag: 5080 + +tp_config: + Intern-S1: 8 + Qwen3-235B-A22B: 8 + Qwen3-235B-A22B-FP8: 4 + Qwen3-30B-A3B: 2 + Qwen3-32B: 2 + gpt-oss-120b: 2 + + +turbomind_chat_model: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - Qwen/Qwen3-0.6B-FP8 + - Qwen/Qwen3-1.7B-FP8 + - Qwen/Qwen3-4B-FP8 + - Qwen/Qwen3-8B-FP8 + - Qwen/Qwen3-14B-FP8 + - Qwen/Qwen3-235B-A22B + - Qwen/Qwen3-235B-A22B-FP8 + - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-32B + - Qwen/Qwen3-32B-FP8 + - openai/gpt-oss-120b + - openai/gpt-oss-20b + +pytorch_chat_model: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - Qwen/Qwen3-0.6B-FP8 + - Qwen/Qwen3-1.7B-FP8 + - Qwen/Qwen3-4B-FP8 + - Qwen/Qwen3-8B-FP8 + - Qwen/Qwen3-14B-FP8 + - Qwen/Qwen3-235B-A22B + - Qwen/Qwen3-235B-A22B-FP8 + - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-32B + - Qwen/Qwen3-32B-FP8 + - openai/gpt-oss-120b + - openai/gpt-oss-20b + +turbomind_vl_model: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + +pytorch_vl_model: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + +turbomind_base_model: + - internlm/Intern-S1-mini + - Qwen/Qwen3-4B-FP8 + - openai/gpt-oss-20b + +pytorch_base_model: + - internlm/Intern-S1-mini + - Qwen/Qwen3-4B-FP8 + - openai/gpt-oss-20b + +turbomind_quatization: + no_awq: + - empty + gptq: + - empty + no_kvint4: + - Qwen/Qwen3-0.6B-FP8 + - Qwen/Qwen3-1.7B-FP8 + - Qwen/Qwen3-4B-FP8 + - Qwen/Qwen3-8B-FP8 + - Qwen/Qwen3-14B-FP8 + - Qwen/Qwen3-235B-A22B + - 
Qwen/Qwen3-235B-A22B-FP8 + - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-32B + - Qwen/Qwen3-32B-FP8 + no_kvint8: + - empty + +pytorch_quatization: + awq: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - Qwen/Qwen3-0.6B-FP8 + - Qwen/Qwen3-1.7B-FP8 + - Qwen/Qwen3-4B-FP8 + - Qwen/Qwen3-8B-FP8 + - Qwen/Qwen3-14B-FP8 + - Qwen/Qwen3-235B-A22B + - Qwen/Qwen3-235B-A22B-FP8 + - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-32B + - Qwen/Qwen3-32B-FP8 + - openai/gpt-oss-120b + - openai/gpt-oss-20b + w8a8: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + no_kvint4: + - Qwen/Qwen3-0.6B-FP8 + - Qwen/Qwen3-1.7B-FP8 + - Qwen/Qwen3-4B-FP8 + - Qwen/Qwen3-8B-FP8 + - Qwen/Qwen3-14B-FP8 + - Qwen/Qwen3-235B-A22B + - Qwen/Qwen3-235B-A22B-FP8 + - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-32B + - Qwen/Qwen3-32B-FP8 + no_kvint8: + - empty From 96be64daee2ef4df834703754949954c111bf610 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 16 Sep 2025 10:49:34 +0800 Subject: [PATCH 02/13] update --- autotest/config-h800.yaml | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml index 11c771f5a4..7a47c5e8b7 100644 --- a/autotest/config-h800.yaml +++ b/autotest/config-h800.yaml @@ -1,10 +1,10 @@ -model_path: /nvme/qa_test_models -resource_path: /nvme/qa_test_models/resource -dst_path: /nvme/qa_test_models/autotest_model -log_path: /nvme/qa_test_models/autotest_model/log -benchmark_path: /nvme/qa_test_models/benchmark-reports -dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json -env_tag: 5080 +model_path: /nvme1/qa_test_models +resource_path: /nvme1/qa_test_models/resource +dst_path: /nvme1/qa_test_models/autotest_model +log_path: /nvme1/qa_test_models/autotest_model/log +benchmark_path: /nvme1/qa_test_models/benchmark-reports +dataset_path: /nvme1/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +env_tag: h800 tp_config: Intern-S1: 8 @@ -13,7 +13,8 @@ tp_config: Qwen3-30B-A3B: 2 
Qwen3-32B: 2 gpt-oss-120b: 2 - + gpt-oss-120b-BF16: 4 + gpt-oss-20b-BF16: 2 turbomind_chat_model: - internlm/Intern-S1 @@ -44,8 +45,8 @@ pytorch_chat_model: - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 - - openai/gpt-oss-120b - - openai/gpt-oss-20b + - unsloth/gpt-oss-120b-BF16 + - unsloth/gpt-oss-20b-BF16 turbomind_vl_model: - internlm/Intern-S1 From a1e5bf747754adba602ffce5d8cbe2a1c8df9386 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 16 Sep 2025 11:02:32 +0800 Subject: [PATCH 03/13] update --- .github/workflows/daily_ete_test_h800.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index dfcbe2d718..2660d3844e 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -100,6 +100,7 @@ jobs: volumes: - /nvme/qa_test_models:/nvme/qa_test_models - /nvme2/share:/nvme2/share + - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Clone repository @@ -142,6 +143,7 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/qa_test_models:/nvme/qa_test_models - /nvme2/share:/nvme2/share + - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Copy repository and Artifacts @@ -216,6 +218,7 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/qa_test_models:/nvme/qa_test_models - /nvme2/share:/nvme2/share + - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Copy repository and Artifacts @@ -303,6 +306,7 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/qa_test_models:/nvme/qa_test_models - /nvme2/share:/nvme2/share + - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Copy repository and Artifacts From 6c70f277b3a6d2afd0c647d6a49ccf071f6de336 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 
16 Sep 2025 11:11:56 +0800 Subject: [PATCH 04/13] update --- .github/workflows/daily_ete_test_h800.yml | 5 +++++ autotest/config-h800.yaml | 12 ++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index 2660d3844e..b58261108f 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -99,6 +99,7 @@ jobs: options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme1/qa_test_models:/nvme1/qa_test_models - /nvme2/share:/nvme2/share - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro @@ -142,6 +143,7 @@ jobs: volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme1/qa_test_models:/nvme1/qa_test_models - /nvme2/share:/nvme2/share - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro @@ -217,6 +219,7 @@ jobs: volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme1/qa_test_models:/nvme1/qa_test_models - /nvme2/share:/nvme2/share - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro @@ -305,6 +308,7 @@ jobs: volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme1/qa_test_models:/nvme1/qa_test_models - /nvme2/share:/nvme2/share - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro @@ -366,6 +370,7 @@ jobs: volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme1/qa_test_models:/nvme1/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Copy repository and Artifacts diff --git a/autotest/config-h800.yaml 
b/autotest/config-h800.yaml index 7a47c5e8b7..275a3fdba9 100644 --- a/autotest/config-h800.yaml +++ b/autotest/config-h800.yaml @@ -1,9 +1,9 @@ -model_path: /nvme1/qa_test_models -resource_path: /nvme1/qa_test_models/resource -dst_path: /nvme1/qa_test_models/autotest_model -log_path: /nvme1/qa_test_models/autotest_model/log -benchmark_path: /nvme1/qa_test_models/benchmark-reports -dataset_path: /nvme1/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource +dst_path: /nvme/qa_test_models/autotest_model +log_path: /nvme/qa_test_models/autotest_model/log +benchmark_path: /nvme/qa_test_models/benchmark-reports +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json env_tag: h800 tp_config: From 1c4c42ae82d945720c1deb389c3f5204cdc1fded Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 16 Sep 2025 11:23:45 +0800 Subject: [PATCH 05/13] update mark --- .github/workflows/daily_ete_test_h800.yml | 24 +++++++++---------- .../chat/test_command_chat_hf_pytorch.py | 3 +++ .../chat/test_command_chat_hf_turbomind.py | 5 ++++ .../test_pipeline_chat_pytorch_llm.py | 3 +++ .../test_pipeline_chat_turbomind_llm.py | 5 ++++ .../test_pipeline_chat_turbomind_mllm.py | 2 ++ .../test_restful_chat_hf_pytorch_llm.py | 8 +++++++ .../test_restful_chat_hf_turbomind_llm.py | 8 +++++++ 8 files changed, 46 insertions(+), 12 deletions(-) diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index b58261108f..1c2669d27e 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -247,37 +247,37 @@ jobs: continue-on-error: true if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' run: | - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} 
${{env.COV_PARAM}} || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test and not other' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true if: matrix.function == 'pipeline' run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and test_3090' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} 
${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true if: matrix.function == 'restful' run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and not other' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage 
${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py index 3c13cb1ebf..9f84eb9d8f 100644 --- a/autotest/tools/chat/test_command_chat_hf_pytorch.py +++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py @@ -296,6 +296,7 @@ def test_hf_pytorch_chat_pr(config, model, cli_case_config): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct']) 
def test_modelscope_pytorch_chat_tp1(config, model, cli_case_config, worker_id): os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True' @@ -319,6 +320,7 @@ def test_modelscope_pytorch_chat_tp1(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', ['meta-llama/Llama-2-7b-chat-hf']) def test_pytorch_chat_with_lora_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -340,6 +342,7 @@ def test_pytorch_chat_with_lora_tp1(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['baichuan-inc/Baichuan2-13B-Chat']) def test_pytorch_chat_with_lora_tp2(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index d7cb3770ed..966f87efb6 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -261,6 +261,7 @@ def test_hf_turbomind_chat_kvint4_tp8(config, model, communicator, cli_case_conf @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', [ 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' @@ -288,6 +289,7 @@ def test_hf_turbomind_chat_fallback_backend_tp1(config, model, communicator, cli @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', [ 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' @@ -315,6 +317,7 @@ def 
test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, communicat @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_fallback_backend_tp2(config, model, communicator, cli_case_config, worker_id): @@ -337,6 +340,7 @@ def test_hf_turbomind_chat_fallback_backend_tp2(config, model, communicator, cli @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_fallback_backend_kvint8_tp2(config, model, communicator, cli_case_config, worker_id): @@ -454,6 +458,7 @@ def test_hf_turbomind_chat_pr_gpu1(config, model, communicator, cli_case_config) @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct']) def test_modelscope_turbomind_chat_tp1(config, model, cli_case_config, worker_id): os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True' diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py index dca119649e..dec6c31798 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py @@ -157,6 +157,7 @@ def test_pipeline_chat_pytorch_pr(config, common_case_config, model, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct']) def 
test_modelscope_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id): @@ -171,6 +172,7 @@ def test_modelscope_pipeline_chat_pytorch_tp1(config, common_case_config, model, @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', ['meta-llama/Llama-2-7b-chat-hf']) def test_pipeline_chat_pytorch_with_lora_tp1(config, common_case_config, model, worker_id): @@ -186,6 +188,7 @@ def test_pipeline_chat_pytorch_with_lora_tp1(config, common_case_config, model, @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', ['baichuan-inc/Baichuan2-13B-Chat']) def test_pipeline_chat_pytorch_with_lora_tp2(config, common_case_config, model, worker_id): diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py index c92d6aa148..647e310863 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py @@ -179,6 +179,7 @@ def test_pipeline_chat_kvint8_tp8(config, common_case_config, model, communicato @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', [ 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' @@ -201,6 +202,7 @@ def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, c @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', [ 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' @@ -226,6 +228,7 @@ def 
test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, m @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_tp2(config, common_case_config, model, communicator, worker_id): @@ -245,6 +248,7 @@ def test_pipeline_chat_fallback_backend_tp2(config, common_case_config, model, c @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp2(config, common_case_config, model, communicator, worker_id): @@ -286,6 +290,7 @@ def test_pipeline_chat_pr(config, common_case_config, model, communicator, worke @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct']) def test_modelscope_pipeline_chat_tp1(config, common_case_config, model, worker_id): diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index f14cdf605d..d095e4fd98 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -150,6 +150,7 @@ def test_pipeline_chat_kvint8_tp4(config, model, communicator, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['OpenGVLab/InternVL2-4B', 'THUDM/glm-4v-9b', 'THUDM/glm-4v-9b-inner-4bits']) @pytest.mark.parametrize('communicator', get_communicator_list()) def 
test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_id): @@ -163,6 +164,7 @@ def test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_ @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['OpenGVLab/InternVL2-4B', 'THUDM/glm-4v-9b', 'THUDM/glm-4v-9b-inner-4bits']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, worker_id): diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py index eaf574c591..f1aff303cd 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py @@ -172,6 +172,7 @@ def test_restful_chat_kvint8_tp8(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [{ 'model': 'Qwen/Qwen2.5-7B-Instruct', 'cuda_prefix': None, @@ -190,6 +191,7 @@ def test_modelscope_restful_chat_tp1(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api_pytorch @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [{ 'model': 'meta-llama/Llama-2-7b-chat-hf', 'cuda_prefix': None, @@ -208,6 +210,7 @@ def test_restful_chat_with_lora_tp1(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api_pytorch @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [{ 'model': 'baichuan-inc/Baichuan2-13B-Chat', @@ -228,6 +231,7 @@ def test_restful_chat_with_lora_tp2(config, common_case_config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 
+@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B', @@ -249,6 +253,7 @@ def test_restful_chat_reasoning_tp1(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', @@ -270,6 +275,7 @@ def test_restful_chat_reasoning_tp2(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'internlm/internlm2_5-7b-chat', @@ -297,6 +303,7 @@ def test_restful_chat_tools_tp1(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'internlm/internlm2_5-20b-chat', @@ -318,6 +325,7 @@ def test_restful_chat_tools_tp2(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_4 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'meta-llama/Meta-Llama-3-1-70B-Instruct', diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py index b692bd17b5..4f61b88438 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py @@ -180,6 +180,7 @@ def test_restful_chat_kvint8_tp8(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'microsoft/Phi-3-mini-4k-instruct', @@ -246,6 +247,7 @@ def test_restful_chat_fallback_backend_tp1(config, common_case_config, worker_id @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api 
@pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'google/gemma-2-27b-it', @@ -357,6 +359,7 @@ def test_restful_logprobs(worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [{ 'model': 'Qwen/Qwen2.5-7B-Instruct', 'cuda_prefix': None, @@ -376,6 +379,7 @@ def test_modelscope_restful_chat_tp1(config, common_case_config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B', @@ -397,6 +401,7 @@ def test_restful_chat_reasoning_tp1(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', @@ -418,6 +423,7 @@ def test_restful_chat_reasoning_tp2(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'internlm/internlm2_5-7b-chat', @@ -445,6 +451,7 @@ def test_restful_chat_tools_tp1(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'internlm/internlm2_5-20b-chat', @@ -466,6 +473,7 @@ def test_restful_chat_tools_tp2(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_4 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'meta-llama/Meta-Llama-3-1-70B-Instruct', From 8dcfc9d1b46904f407b8f60c14196bf190fdbd26 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 16 Sep 2025 15:17:47 +0800 Subject: [PATCH 06/13] update test image for cu12.4 --- 
.github/workflows/daily_ete_test.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 8a90df4309..3e78d3c957 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -96,7 +96,7 @@ jobs: runs-on: [self-hosted, linux-a100] timeout-minutes: 50 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/qa_test_models:/nvme/qa_test_models @@ -136,7 +136,7 @@ jobs: MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -219,7 +219,7 @@ jobs: MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -324,7 +324,7 @@ jobs: model: Intern-S1 timeout-minutes: 60 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -408,7 +408,7 @@ jobs: needs: test_quantization timeout-minutes: 120 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: 
openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -465,7 +465,7 @@ jobs: needs: test_quantization timeout-minutes: 120 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -520,7 +520,7 @@ jobs: matrix: evaluate_type: ['chat', 'base'] container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -594,7 +594,7 @@ jobs: timeout-minutes: 5 runs-on: [self-hosted, linux-a100] container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/qa_test_models:/nvme/qa_test_models @@ -619,7 +619,7 @@ jobs: needs: [test_tools, test_restful, test_pipeline, test_benchmark] timeout-minutes: 5 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip From 3528a7e464dd526a5ded460afde67e218b45e842 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 16 Sep 2025 19:47:26 +0800 Subject: [PATCH 07/13] update --- .github/workflows/daily_ete_test_h800.yml | 73 ++--------------------- autotest/config-h800.yaml | 27 ++++----- 2 files changed, 18 insertions(+), 82 deletions(-) diff --git a/.github/workflows/daily_ete_test_h800.yml 
b/.github/workflows/daily_ete_test_h800.yml index 1c2669d27e..9036a039a6 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -128,73 +128,10 @@ jobs: if: ${{inputs.offline_mode}} run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} - test_quantization: - needs: download_pkgs - if: ${{!cancelled() && contains(needs.download_pkgs.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} - runs-on: [self-hosted, h800-r1] - timeout-minutes: 150 - env: - PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA - MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub - MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules - container: - image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/qa_test_models:/nvme/qa_test_models - - /nvme1/qa_test_models:/nvme1/qa_test_models - - /nvme2/share:/nvme2/share - - /mnt/137_nvme4:/mnt/137_nvme4 - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Copy repository and Artifacts - run: | - cp -r ${{env.TEST_CODE_PATH}}/. . 
- - name: Install lmdeploy - dependency - run: | - python3 -m pip install auto_gptq matplotlib attrdict - python3 -m pip install -r requirements/lite.txt - - name: Install lmdeploy - run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps - python3 -m pip install transformers==4.53.1 datasets==3.6.0 timm - python3 -m pip install -r requirements/test.txt - - name: Check env - run: | - python3 -m pip list - lmdeploy check_env - rm -rf allure-results - # remove tmp log in testcase - rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p - ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - - name: Test lmdeploy - quantization w4a16 - continue-on-error: true - if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') - run: | - pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - quantization w8a8 - continue-on-error: true - if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'pytorch') - run: | - pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Clear workfile - if: always() - run: | - chmod -R 777 $REPORT_DIR - export workdir=$(pwd) - cd .. 
- rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir - test_tools: - if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + if: ${{!cancelled() && !contains(needs.download_pkgs.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} runs-on: [self-hosted, h800-r1] - needs: test_quantization + needs: download_pkgs timeout-minutes: 300 strategy: fail-fast: false @@ -290,14 +227,14 @@ jobs: chmod -R 777 $workdir test_restful: - if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + if: ${{!cancelled() && !contains(needs.download_pkgs.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} runs-on: [self-hosted, h800-r1] - needs: test_quantization + needs: download_pkgs strategy: fail-fast: false matrix: backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} - model: ['Intern-S1', 'internlm2_5-20b-chat', 'internlm2_5-20b'] + model: ['Intern-S1'] include: - tp: 8 model: Intern-S1 diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml index 275a3fdba9..318b6dc465 100644 --- a/autotest/config-h800.yaml +++ b/autotest/config-h800.yaml @@ -68,10 +68,8 @@ pytorch_base_model: turbomind_quatization: no_awq: - - empty - gptq: - - empty - no_kvint4: + - internlm/Intern-S1 + - internlm/Intern-S1-mini - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 - Qwen/Qwen3-4B-FP8 @@ -82,13 +80,11 @@ turbomind_quatization: - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 - no_kvint8: + - openai/gpt-oss-120b + - openai/gpt-oss-20b + gptq: - empty - -pytorch_quatization: - awq: - - internlm/Intern-S1 - - internlm/Intern-S1-mini + no_kvint4: - Qwen/Qwen3-0.6B-FP8 - 
Qwen/Qwen3-1.7B-FP8 - Qwen/Qwen3-4B-FP8 @@ -99,11 +95,14 @@ pytorch_quatization: - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 - - openai/gpt-oss-120b - - openai/gpt-oss-20b + no_kvint8: + - empty + +pytorch_quatization: + awq: + - empty w8a8: - - internlm/Intern-S1 - - internlm/Intern-S1-mini + - empty no_kvint4: - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 From 1b89cb27301fbdcb56787b0f11db07df330a0553 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Wed, 17 Sep 2025 10:22:56 +0800 Subject: [PATCH 08/13] update --- .github/workflows/daily_ete_test_h800.yml | 4 ++-- autotest/config.yaml | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index 9036a039a6..ba2a9c6560 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -37,7 +37,7 @@ on: required: true description: 'regression functions' type: string - default: "['quant', 'tools','restful','pipeline']" + default: "['tools','restful']" schedule: - cron: '00 14 * * 0-4' @@ -115,7 +115,7 @@ jobs: run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-h800.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml - name: Copy repository - offline if: ${{inputs.offline_mode}} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-3090.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-h800.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml - name: Download Artifacts if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} uses: actions/download-artifact@v4 diff --git a/autotest/config.yaml b/autotest/config.yaml index fab9a5af89..5844758229 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -33,8 +33,8 @@ tp_config: MiniCPM-V-2_6: 2 gemma-2-27b-it: 2 InternVL2-Llama3-76B-AWQ: 4 - gpt-oss-20b: 2 - gpt-oss-120b: 4 + gpt-oss-20b-bf16: 2 + gpt-oss-120b-bf16: 4 turbomind_chat_model: @@ -139,8 +139,8 @@ pytorch_chat_model: - Qwen/Qwen2.5-VL-32B-Instruct - Qwen/Qwen2-VL-2B-Instruct - Qwen/Qwen2-VL-7B-Instruct - - openai/gpt-oss-20b - - openai/gpt-oss-120b + - lmsys/gpt-oss-20b-bf16 + - lmsys/gpt-oss-120b-bf16 - mistralai/Mistral-7B-Instruct-v0.3 - mistralai/Mixtral-8x7B-Instruct-v0.1 - google/gemma-3-12b-it @@ -368,5 +368,5 @@ benchmark_model: - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-72B-Instruct - deepseek-ai/DeepSeek-V2-Lite-Chat - - openai/gpt-oss-20b - - openai/gpt-oss-120b + - lmsys/gpt-oss-20b-bf16 + - lmsys/gpt-oss-120b-bf16 From 06d0532625e61a0fab03659bf820c599c13479c3 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Wed, 17 Sep 2025 10:30:09 +0800 Subject: [PATCH 09/13] update --- .github/workflows/daily_ete_test.yml | 12 ++++++------ .github/workflows/daily_ete_test_3090.yml | 6 +++--- .github/workflows/daily_ete_test_5080.yml | 6 +++--- .github/workflows/daily_ete_test_h800.yml | 4 ++-- .../restful/test_restful_chat_hf_turbomind_mllm.py | 1 + 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 3e78d3c957..8b8969ada4 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -168,7 +168,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir 
${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true @@ -251,7 +251,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true @@ -352,7 +352,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Start restful api if: matrix.model != 'internlm2_5-20b' @@ -436,7 +436,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - interface pipeline case run: | @@ -493,7 +493,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test benchmark script run: | @@ -560,7 +560,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Setup paths for evaluation run: | diff --git 
a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 6c0a0942c5..9243887ecf 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -163,7 +163,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true @@ -235,7 +235,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true @@ -299,7 +299,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Start restful api turbomind if: matrix.backend == 'turbomind' diff --git a/.github/workflows/daily_ete_test_5080.yml b/.github/workflows/daily_ete_test_5080.yml index bd12b0beff..3a080f2615 100644 --- a/.github/workflows/daily_ete_test_5080.yml +++ b/.github/workflows/daily_ete_test_5080.yml @@ -163,7 +163,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true @@ -234,7 +234,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf 
/nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true @@ -298,7 +298,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Start restful api turbomind if: matrix.backend == 'turbomind' diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index ba2a9c6560..d6ad6547fd 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -178,7 +178,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true @@ -267,7 +267,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Start restful api if: matrix.model != 'internlm2_5-20b' diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py index 2cfbc00020..c238ccd962 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py @@ -146,6 +146,7 @@ def test_restful_chat_kvint8_tp4(config, worker_id): @pytest.mark.order(7) 
@pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'OpenGVLab/InternVL2-4B', From 2e128dc6dd09ddf6e6c486b1c76c5b41a29db6cb Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Wed, 17 Sep 2025 11:21:55 +0800 Subject: [PATCH 10/13] update --- .github/workflows/benchmark.yml | 2 +- .github/workflows/daily_ete_test.yml | 5 ++--- .github/workflows/daily_ete_test_h800.yml | 2 +- .github/workflows/evaluate.yml | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 62d2e19ca5..00f7e64b61 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -108,7 +108,7 @@ jobs: - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 8b8969ada4..1c2f0b549d 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -550,8 +550,7 @@ jobs: run: | git clone --depth=1 https://github.com/open-compass/opencompass.git cd opencompass - cp /nvme/qa_test_models/offline_pkg/requirements-oc.txt requirements/runtime.txt - python3 -m pip install -e . + python3 -m pip install . 
echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV - name: Check env run: | @@ -571,7 +570,7 @@ jobs: run: | export LMDEPLOY_DIR=$(pwd) - python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true - name: Evaluate base models if: matrix.evaluate_type == 'base' run: | diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index d6ad6547fd..43b9dc4417 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ 
b/.github/workflows/daily_ete_test_h800.yml @@ -169,7 +169,7 @@ jobs: python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 47e1929421..be64e8743f 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -136,7 +136,7 @@ jobs: - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} From a30c72d79d12e58d59a35e7cd6c3530142826ee9 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Wed, 17 Sep 2025 16:09:31 +0800 Subject: [PATCH 11/13] update --- .github/workflows/daily_ete_test_h800.yml | 2 +- autotest/config-h800.yaml | 2 +- autotest/tools/pipeline/llm_case.py | 2 +- autotest/tools/pipeline/mllm_case.py | 2 +- autotest/utils/run_restful_chat.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index 43b9dc4417..db6131845e 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -289,7 +289,7 @@ jobs: if: always() run: | chmod -R 777 $REPORT_DIR - chmod -R 777 /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + chmod -R 777 ${{env.REPORT_DIR}} export workdir=$(pwd) cd .. 
rm -rf $workdir diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml index 318b6dc465..1a3600a72d 100644 --- a/autotest/config-h800.yaml +++ b/autotest/config-h800.yaml @@ -9,7 +9,7 @@ env_tag: h800 tp_config: Intern-S1: 8 Qwen3-235B-A22B: 8 - Qwen3-235B-A22B-FP8: 4 + Qwen3-235B-A22B-FP8: 8 Qwen3-30B-A3B: 2 Qwen3-32B: 2 gpt-oss-120b: 2 diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 2de77d2bd3..17c3f58376 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -7,7 +7,7 @@ from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline from lmdeploy.utils import is_bf16_supported -gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) +gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=10) def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = None): diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index 9689581ef9..050ae1e1b7 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -10,7 +10,7 @@ from lmdeploy.vl.constants import IMAGE_TOKEN from lmdeploy.vl.utils import encode_image_base64 -gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) +gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=10) PIC1 = 'tiger.jpeg' PIC2 = 'human-pose.jpg' diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index bd7c7244a2..672a235ead 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -144,7 +144,7 @@ def run_all_step(config, cases_info, worker_id: str = '', port: int = DEFAULT_PO case_info = cases_info.get(case) - with allure.step(case + ' step2 - restful_test - openai chat'): + with allure.step(case + ' restful_test - openai chat'): restful_result, restful_log, msg = open_chat_test(config, case, case_info, 
model, http_url, worker_id) allure.attach.file(restful_log, attachment_type=allure.attachment_type.TEXT) with assume: From b375a0a60c3ffc536756064c94a27c2b036e1ad9 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Thu, 18 Sep 2025 18:08:28 +0800 Subject: [PATCH 12/13] remove communicator native when tp=1 --- .github/workflows/daily_ete_test_h800.yml | 4 ++ autotest/config-h800.yaml | 7 ++- .../chat/test_command_chat_hf_turbomind.py | 37 ++++++---------- .../test_pipeline_chat_turbomind_llm.py | 39 +++++------------ .../test_pipeline_chat_turbomind_mllm.py | 43 ++++++------------- .../test_restful_chat_hf_turbomind_llm.py | 24 ++--------- .../test_restful_chat_hf_turbomind_mllm.py | 12 +++--- autotest/utils/config_utils.py | 4 +- autotest/utils/pipeline_chat.py | 5 ++- 9 files changed, 59 insertions(+), 116 deletions(-) diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index db6131845e..1dab90bebf 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -158,6 +158,8 @@ jobs: - /nvme/qa_test_models:/nvme/qa_test_models - /nvme1/qa_test_models:/nvme1/qa_test_models - /nvme2/share:/nvme2/share + - /mnt/137_nvme2:/mnt/137_nvme2 + - /mnt/137_nvme3:/mnt/137_nvme3 - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: @@ -247,6 +249,8 @@ jobs: - /nvme/qa_test_models:/nvme/qa_test_models - /nvme1/qa_test_models:/nvme1/qa_test_models - /nvme2/share:/nvme2/share + - /mnt/137_nvme2:/mnt/137_nvme2 + - /mnt/137_nvme3:/mnt/137_nvme3 - /mnt/137_nvme4:/mnt/137_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml index 1a3600a72d..068d074452 100644 --- a/autotest/config-h800.yaml +++ b/autotest/config-h800.yaml @@ -27,6 +27,7 @@ turbomind_chat_model: - Qwen/Qwen3-235B-A22B - Qwen/Qwen3-235B-A22B-FP8 - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-32B - 
Qwen/Qwen3-32B-FP8 - openai/gpt-oss-120b @@ -43,6 +44,7 @@ pytorch_chat_model: - Qwen/Qwen3-235B-A22B - Qwen/Qwen3-235B-A22B-FP8 - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 - unsloth/gpt-oss-120b-BF16 @@ -64,7 +66,7 @@ turbomind_base_model: pytorch_base_model: - internlm/Intern-S1-mini - Qwen/Qwen3-4B-FP8 - - openai/gpt-oss-20b + - unsloth/gpt-oss-20b-BF16 turbomind_quatization: no_awq: @@ -78,6 +80,7 @@ turbomind_quatization: - Qwen/Qwen3-235B-A22B - Qwen/Qwen3-235B-A22B-FP8 - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 - openai/gpt-oss-120b @@ -93,6 +96,7 @@ turbomind_quatization: - Qwen/Qwen3-235B-A22B - Qwen/Qwen3-235B-A22B-FP8 - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 no_kvint8: @@ -112,6 +116,7 @@ pytorch_quatization: - Qwen/Qwen3-235B-A22B - Qwen/Qwen3-235B-A22B-FP8 - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 no_kvint8: diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index 966f87efb6..7e8250daa4 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -12,8 +12,7 @@ @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_chat_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' if 'coder' in model: usercase = 'code_testcase' @@ -22,8 +21,7 @@ def test_hf_turbomind_chat_tp1(config, model, communicator, cli_case_config, wor cli_case_config.get(usercase), model, 'turbomind', - cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator}') + 
cuda_prefix=get_cuda_prefix_by_workerid(worker_id)) if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -103,8 +101,7 @@ def test_hf_turbomind_chat_tp8(config, model, communicator, cli_case_config, wor @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_kvint4_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_chat_kvint4_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' if 'coder' in model: usercase = 'code_testcase' @@ -114,7 +111,7 @@ def test_hf_turbomind_chat_kvint4_tp1(config, model, communicator, cli_case_conf model, 'turbomind', cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator} --quant-policy 4') + extra='--quant-policy 4') if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -172,8 +169,7 @@ def test_hf_turbomind_chat_kvint4_tp4(config, model, communicator, cli_case_conf @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_kvint8_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_chat_kvint8_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' if 'coder' in model: usercase = 'code_testcase' @@ -183,7 +179,7 @@ def test_hf_turbomind_chat_kvint8_tp1(config, model, communicator, cli_case_conf model, 'turbomind', cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator} --quant-policy 8') + extra='--quant-policy 8') if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -266,8 +262,7 @@ 
def test_hf_turbomind_chat_kvint4_tp8(config, model, communicator, cli_case_conf 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' ]) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_fallback_backend_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_chat_fallback_backend_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' if 'coder' in model: usercase = 'code_testcase' @@ -276,8 +271,7 @@ def test_hf_turbomind_chat_fallback_backend_tp1(config, model, communicator, cli cli_case_config.get(usercase), model, 'turbomind', - cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator}') + cuda_prefix=get_cuda_prefix_by_workerid(worker_id)) if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -294,8 +288,7 @@ def test_hf_turbomind_chat_fallback_backend_tp1(config, model, communicator, cli 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' ]) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' if 'coder' in model: usercase = 'code_testcase' @@ -305,7 +298,7 @@ def test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, communicat model, 'turbomind', cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator} --quant-policy 8') + extra='--quant-policy 8') if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -365,7 +358,6 @@ def test_hf_turbomind_chat_fallback_backend_kvint8_tp2(config, model, 
communicat @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='base_model')) -@pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_base_tp1(config, model, communicator, cli_case_config, worker_id): usercase = 'base_testcase' result, chat_log, msg = hf_command_line_test(config, @@ -373,8 +365,7 @@ def test_hf_turbomind_base_tp1(config, model, communicator, cli_case_config, wor cli_case_config.get(usercase), model, 'turbomind', - cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator}') + cuda_prefix=get_cuda_prefix_by_workerid(worker_id)) if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -436,8 +427,7 @@ def test_hf_turbomind_chat_pr(config, model, communicator, cli_case_config): @pytest.mark.gpu_num_1 @pytest.mark.pr_test @pytest.mark.parametrize('model', ['OpenGVLab/InternVL3-8B']) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_pr_gpu1(config, model, communicator, cli_case_config): +def test_hf_turbomind_chat_pr_gpu1(config, model, cli_case_config): usercase = 'chat_testcase' result, chat_log, msg = hf_command_line_test(config, @@ -445,8 +435,7 @@ def test_hf_turbomind_chat_pr_gpu1(config, model, communicator, cli_case_config) cli_case_config.get(usercase), model, 'turbomind', - cuda_prefix='CUDA_VISIBLE_DEVICES=5,6', - extra=f'--communicator {communicator}') + cuda_prefix='CUDA_VISIBLE_DEVICES=5,6') if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py index 647e310863..fcac93e935 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py @@ -12,11 +12,10 @@ 
@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_tp1(config, common_case_config, model, communicator, worker_id): +def test_pipeline_chat_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {'communicator': communicator}) + run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {}) @pytest.mark.order(6) @@ -65,14 +64,10 @@ def test_pipeline_chat_tp8(config, common_case_config, model, communicator, work @pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, communicator, worker_id): +def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { - 'quant_policy': 4, - 'communicator': communicator - }) + run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, {'quant_policy': 4}) @pytest.mark.order(6) @@ -116,14 +111,10 @@ def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, communicato @pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, communicator, worker_id): +def test_pipeline_chat_kvint8_tp1(config, common_case_config, 
model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { - 'quant_policy': 8, - 'communicator': communicator - }) + run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, {'quant_policy': 8}) @pytest.mark.order(6) @@ -184,17 +175,11 @@ def test_pipeline_chat_kvint8_tp8(config, common_case_config, model, communicato 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' ]) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, communicator, worker_id): +def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) - run_pipeline_chat_test(config, - common_case_config, - model, - 'turbomind', - worker_id, {'communicator': communicator}, - is_smoke=True) + run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {}, is_smoke=True) @pytest.mark.order(6) @@ -207,8 +192,7 @@ def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, c 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' ]) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, model, communicator, worker_id): +def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=1) os.environ['MASTER_PORT'] = 
str(int(worker_id.replace('gw', '')) + 29500) @@ -216,10 +200,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, m common_case_config, model, 'turbomind-kvint', - worker_id, { - 'quant_policy': 8, - 'communicator': communicator - }, + worker_id, {'quant_policy': 8}, is_smoke=True) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index d095e4fd98..bce455d508 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -14,11 +14,10 @@ @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='vl_model')) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_tp1(config, model, communicator, worker_id): +def test_pipeline_chat_tp1(config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}) + run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {}) @pytest.mark.order(6) @@ -56,14 +55,10 @@ def test_pipeline_chat_tp4(config, model, communicator, worker_id): @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4, model_type='vl_model')) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_kvint4_tp1(config, model, communicator, worker_id): +def test_pipeline_chat_kvint4_tp1(config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, { - 'quant_policy': 4, - 'communicator': communicator - }) + run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, 
{'quant_policy': 4}) @pytest.mark.order(6) @@ -104,14 +99,10 @@ def test_pipeline_chat_kvint4_tp4(config, model, communicator, worker_id): @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8, model_type='vl_model')) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_kvint8_tp1(config, model, communicator, worker_id): +def test_pipeline_chat_kvint8_tp1(config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, { - 'quant_policy': 8, - 'communicator': communicator - }) + run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8}) @pytest.mark.order(6) @@ -152,12 +143,11 @@ def test_pipeline_chat_kvint8_tp4(config, model, communicator, worker_id): @pytest.mark.gpu_num_2 @pytest.mark.other @pytest.mark.parametrize('model', ['OpenGVLab/InternVL2-4B', 'THUDM/glm-4v-9b', 'THUDM/glm-4v-9b-inner-4bits']) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_id): +def test_pipeline_chat_fallback_backend_tp1(config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) - run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}, is_smoke=True) + run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {}, is_smoke=True) @pytest.mark.order(6) @@ -166,19 +156,11 @@ def test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_ @pytest.mark.gpu_num_2 @pytest.mark.other @pytest.mark.parametrize('model', ['OpenGVLab/InternVL2-4B', 'THUDM/glm-4v-9b', 'THUDM/glm-4v-9b-inner-4bits']) -@pytest.mark.parametrize('communicator', 
get_communicator_list()) -def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, worker_id): +def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) - run_pipeline_vl_chat_test(config, - model, - BACKEND_KVINT, - worker_id, { - 'quant_policy': 8, - 'communicator': communicator - }, - is_smoke=True) + run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8}, is_smoke=True) @pytest.mark.pipeline_chat @@ -188,8 +170,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, @pytest.mark.parametrize( 'model', ['liuhaotian/llava-v1.6-vicuna-7b', 'OpenGVLab/InternVL2-4B', 'OpenGVLab/InternVL2-8B', 'OpenGVLab/InternVL3-8B']) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_pr_test(config, model, communicator, worker_id): +def test_pipeline_pr_test(config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = str(int(get_cuda_id_by_workerid(worker_id)) + 5) - run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}, is_smoke=True) + run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {}, is_smoke=True) diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py index 4f61b88438..519ed6718f 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py @@ -201,37 +201,19 @@ def test_restful_chat_kvint8_tp8(config, common_case_config, worker_id): 'model': 'microsoft/Phi-3-mini-4k-instruct', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --communicator native' + 'extra': ' --quant-policy 8' }, { 'model': 
'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --communicator native' + 'extra': ' --quant-policy 8' }, { 'model': 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --communicator native' - }, - { - 'model': 'microsoft/Phi-3-mini-4k-instruct', - 'cuda_prefix': None, - 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' - }, - { - 'model': 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', - 'cuda_prefix': None, - 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' - }, - { - 'model': 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8', - 'cuda_prefix': None, - 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8' }, ], indirect=True) diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py index c238ccd962..a98fbbfd6c 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py @@ -19,7 +19,7 @@ def prepare_environment(request, config, worker_id): def getModelList(tp_num): model_list = [] - for communicator in get_communicator_list(): + for communicator in get_communicator_list(tp_num): model_list += [{ 'model': item, 'cuda_prefix': None, @@ -65,7 +65,7 @@ def test_restful_chat_tp4(config, worker_id): def getKvintModelList(tp_num, quant_policy: int = None): model_list = [] - for communicator in get_communicator_list(): + for communicator in get_communicator_list(tp_num): model_list += [{ 'model': item, 'cuda_prefix': None, @@ -172,25 +172,25 @@ def test_restful_chat_kvint8_tp4(config, worker_id): 'model': 'OpenGVLab/InternVL2-4B', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8' }, { 'model': 'Qwen/Qwen2.5-VL-7B-Instruct', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --quant-policy 8 
--communicator native' + 'extra': ' --quant-policy 8' }, { 'model': 'THUDM/glm-4v-9b', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8' }, { 'model': 'THUDM/glm-4v-9b-inner-4bits', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8' }, ], indirect=True) diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index c53e33bf0f..e9e49a3666 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -83,8 +83,8 @@ def get_all_model_list(tp_num: int = None, quant_policy: int = None, model_type: return case_list -def get_communicator_list(): - if is_bf16_supported(): +def get_communicator_list(tp_num: int = None): + if tp_num != 1 and is_bf16_supported(): return ['native', 'nccl'] return ['nccl'] diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 01c686f7dd..8fc48d92b5 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -279,13 +279,14 @@ def internvl_vl_testcase(output_text, f, lang: str = 'en'): assert case_result, f'reason: separate images: panda should in {response}' with allure.step(f'internvl-separate-images2-{lang}'): response = get_response_from_output(output_text, f'internvl-separate-images2-{lang}') - case_result = any(word in response.lower() for word in ['panda', '熊猫', 'same', 'different', 'difference']) + case_result = any(word in response.lower() + for word in ['panda', '熊猫', 'same', 'different', 'difference', 'identical']) f.writelines(f'internvl-separate-images2-{lang} result: {case_result}, reason: panda should in {response} \n') with assume: assert case_result, f'reason: separate images2: panda should in {response}' with allure.step(f'internvl-video-{lang}'): response = get_response_from_output(output_text, f'internvl-video-{lang}') - case_result = any(word in response.lower() for word in ['red panda', 'eat', 
'熊猫', '竹子', 'food']) + case_result = any(word in response.lower() for word in ['red panda', 'eat', '熊猫', '竹子', 'food', 'hold']) f.writelines(f'internvl-video-{lang} result: {case_result}, reason: panda should in {response} \n') with assume: assert case_result, f'reason: video: panda should in {response}' From 75d2089ddfbbca47498c9d0664c717f31b820ef3 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Thu, 18 Sep 2025 18:26:13 +0800 Subject: [PATCH 13/13] fix lint --- .../chat/test_command_chat_hf_turbomind.py | 4 ++-- .../test_pipeline_chat_turbomind_mllm.py | 18 +++++++++--------- .../test_restful_chat_hf_turbomind_llm.py | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index 991db6b383..82c747358b 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -358,7 +358,7 @@ def test_hf_turbomind_chat_fallback_backend_kvint8_tp2(config, model, communicat @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='base_model')) -def test_hf_turbomind_base_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_base_tp1(config, model, cli_case_config, worker_id): usercase = 'base_testcase' result, chat_log, msg = hf_command_line_test(config, usercase, @@ -438,7 +438,7 @@ def test_hf_turbomind_chat_pr_gpu1(config, model, cli_case_config): cli_case_config.get(usercase), model, 'turbomind', - cuda_prefix='CUDA_VISIBLE_DEVICES=5,6') + cuda_prefix=env_var + '5,6') if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index 7415243ec4..bcfd071eba 100644 --- 
a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -161,14 +161,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, worker_id): if 'gw' in worker_id: set_device_env_variable(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) - run_pipeline_vl_chat_test(config, - model, - BACKEND_KVINT, - worker_id, { - 'quant_policy': 8, - 'communicator': communicator - }, - is_smoke=True) + run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8}, is_smoke=True) @pytest.mark.order(6) @@ -181,7 +174,14 @@ def test_pipeline_chat_fallback_backend_kvint8_tp2(config, model, communicator, if 'gw' in worker_id: set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) - run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8}, is_smoke=True) + run_pipeline_vl_chat_test(config, + model, + BACKEND_KVINT, + worker_id, { + 'quant_policy': 8, + 'communicator': communicator + }, + is_smoke=True) @pytest.mark.pipeline_chat diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py index 519ed6718f..daf2664662 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py @@ -80,7 +80,7 @@ def test_restful_chat_tp8(config, common_case_config, worker_id): def getKvintModelList(tp_num, quant_policy): model_list = [] - for communicator in get_communicator_list(): + for communicator in get_communicator_list(tp_num): model_list += [{ 'model': item, 'cuda_prefix': None,