diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 0ee4db9a28..00f7e64b61 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -27,7 +27,7 @@ on: env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai - OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} + OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }} REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true FAIL_CONFIG: ${{ github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} @@ -42,7 +42,7 @@ jobs: env: PYTHON_VERSION: ${{ matrix.pyver }} PLAT_NAME: manylinux2014_x86_64 - DOCKER_TAG: cuda11.8 + DOCKER_TAG: cuda12.4 steps: - name: Checkout repository uses: actions/checkout@v3 @@ -108,7 +108,7 @@ jobs: - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index f10bc2993b..1c2f0b549d 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -44,7 +44,7 @@ on: env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai - OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} + OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy @@ -64,7 +64,7 @@ jobs: env: PYTHON_VERSION: ${{ matrix.pyver }} PLAT_NAME: manylinux2014_x86_64 - DOCKER_TAG: cuda11.8 + DOCKER_TAG: cuda12.4 steps: - name: Checkout repository uses: 
actions/checkout@v3 @@ -96,7 +96,7 @@ jobs: runs-on: [self-hosted, linux-a100] timeout-minutes: 50 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/qa_test_models:/nvme/qa_test_models @@ -136,7 +136,7 @@ jobs: MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -168,7 +168,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true @@ -219,7 +219,7 @@ jobs: MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -251,7 +251,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true @@ -324,7 +324,7 @@ jobs: model: Intern-S1 timeout-minutes: 60 container: - image: 
openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -352,7 +352,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Start restful api if: matrix.model != 'internlm2_5-20b' @@ -408,7 +408,7 @@ jobs: needs: test_quantization timeout-minutes: 120 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -436,7 +436,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - interface pipeline case run: | @@ -465,7 +465,7 @@ jobs: needs: test_quantization timeout-minutes: 120 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -493,7 +493,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test benchmark script run: | @@ -520,7 
+520,7 @@ jobs: matrix: evaluate_type: ['chat', 'base'] container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -550,8 +550,7 @@ jobs: run: | git clone --depth=1 https://github.com/open-compass/opencompass.git cd opencompass - cp /nvme/qa_test_models/offline_pkg/requirements-oc.txt requirements/runtime.txt - python3 -m pip install -e . + python3 -m pip install . echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV - name: Check env run: | @@ -560,7 +559,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Setup paths for evaluation run: | @@ -571,7 +570,7 @@ jobs: run: | export LMDEPLOY_DIR=$(pwd) - python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, 
turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true - name: Evaluate base models if: matrix.evaluate_type == 'base' run: | @@ -594,7 +593,7 @@ jobs: timeout-minutes: 5 runs-on: [self-hosted, linux-a100] container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/qa_test_models:/nvme/qa_test_models @@ -619,7 +618,7 @@ jobs: needs: [test_tools, test_restful, test_pipeline, test_benchmark] timeout-minutes: 5 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 21d200a405..9243887ecf 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -153,7 +153,7 @@ jobs: python3 -m pip install -r requirements/lite.txt - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install transformers==4.53.1 datasets==3.6.0 timm 
python3 -m pip install -r requirements/test.txt - name: Check env @@ -163,7 +163,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true @@ -226,7 +226,7 @@ jobs: python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | @@ -235,7 +235,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true @@ -290,7 +290,7 @@ jobs: python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | @@ -299,7 +299,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Start restful api turbomind if: matrix.backend == 'turbomind' @@ -370,7 +370,7 @@ jobs: run: cp -r ${{env.TEST_CODE_PATH}}/. . 
- name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Get coverage report run: | diff --git a/.github/workflows/daily_ete_test_5080.yml b/.github/workflows/daily_ete_test_5080.yml index 7d9b250385..3a080f2615 100644 --- a/.github/workflows/daily_ete_test_5080.yml +++ b/.github/workflows/daily_ete_test_5080.yml @@ -92,7 +92,7 @@ jobs: download_pkgs: needs: linux-build if: ${{!cancelled()}} - runs-on: [self-hosted, 5090-r1] + runs-on: [self-hosted, 5080-r1] timeout-minutes: 50 container: image: openmmlab/lmdeploy:latest-cu12.8 @@ -129,7 +129,7 @@ jobs: test_quantization: needs: download_pkgs if: ${{!cancelled() && contains(needs.download_pkgs.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} - runs-on: [self-hosted, 5090-r1] + runs-on: [self-hosted, 5080-r1] timeout-minutes: 150 env: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA @@ -153,7 +153,7 @@ jobs: python3 -m pip install -r requirements/lite.txt - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install transformers==4.53.1 datasets==3.6.0 timm python3 -m pip install -r requirements/test.txt - name: Check env @@ -163,7 +163,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true @@ -188,7 +188,7 @@ jobs: chmod -R 777 $workdir test_tools: if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 
'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} - runs-on: [self-hosted, 5090-r1] + runs-on: [self-hosted, 5080-r1] needs: test_quantization timeout-minutes: 300 strategy: @@ -225,7 +225,7 @@ jobs: python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | @@ -234,7 +234,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true @@ -265,7 +265,7 @@ jobs: chmod -R 777 $workdir test_restful: if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} - runs-on: [self-hosted, 5090-r1] + runs-on: [self-hosted, 5080-r1] needs: test_quantization strategy: fail-fast: false @@ -289,7 +289,7 @@ jobs: python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | @@ -298,7 +298,7 @@ jobs: rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Start restful api turbomind if: matrix.backend == 'turbomind' @@ -353,7 +353,7 @@ jobs: chmod -R 777 $workdir 
get_coverage_report: if: ${{!cancelled() && success()}} - runs-on: [self-hosted, 5090-r1] + runs-on: [self-hosted, 5080-r1] needs: [test_tools, test_restful] timeout-minutes: 5 container: @@ -368,7 +368,7 @@ jobs: run: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Get coverage report run: | diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml new file mode 100644 index 0000000000..1dab90bebf --- /dev/null +++ b/.github/workflows/daily_ete_test_h800.yml @@ -0,0 +1,338 @@ +name: daily_ete_test_h800 + +on: + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository in org/name form. Default is InternLM/lmdeploy' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + backend: + required: true + description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch']" + model: + required: true + description: 'Set testcase module filter: llm, mllm. Default contains all models' + type: string + default: "['llm','mllm']" + function: + required: true + description: 'Set testcase function filter: chat, restful, pipeline.
Default contains all functions' + type: string + default: '["pipeline", "restful", "chat"]' + offline_mode: + required: true + description: 'Whether to run in offline mode; if true, you must prepare the code and whl package yourself' + type: boolean + default: false + regression_func: + required: true + description: 'regression functions' + type: string + default: "['tools','restful']" + schedule: + - cron: '00 14 * * 0-4' + +env: + HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache + HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai + OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} + COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} + TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ github.run_id }} + OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy + OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt + +jobs: + linux-build: + if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.8 + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: +
if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + + download_pkgs: + needs: linux-build + if: ${{!cancelled()}} + runs-on: [self-hosted, h800-r1] + timeout-minutes: 50 + container: + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme1/qa_test_models:/nvme1/qa_test_models + - /nvme2/share:/nvme2/share + - /mnt/137_nvme4:/mnt/137_nvme4 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-h800.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-h800.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + + test_tools: + if: ${{!cancelled() && !contains(needs.download_pkgs.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + runs-on: [self-hosted, h800-r1] + needs: download_pkgs + timeout-minutes: 300 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} + function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} + exclude: + - backend: turbomind + model: mllm + function: chat + - backend: pytorch + model: mllm + function: chat + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub + MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + container: + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme1/qa_test_models:/nvme1/qa_test_models + - /nvme2/share:/nvme2/share + - /mnt/137_nvme2:/mnt/137_nvme2 + - /mnt/137_nvme3:/mnt/137_nvme3 + - 
/mnt/137_nvme4:/mnt/137_nvme4 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - chat + continue-on-error: true + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test and not other' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - pipeline 
+ continue-on-error: true + if: matrix.function == 'pipeline' + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - restful + continue-on-error: true + if: matrix.function == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and not other' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} 
${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_restful: + if: ${{!cancelled() && !contains(needs.download_pkgs.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + runs-on: [self-hosted, h800-r1] + needs: download_pkgs + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model: ['Intern-S1'] + include: + - tp: 8 + model: Intern-S1 + timeout-minutes: 60 + container: + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme1/qa_test_models:/nvme1/qa_test_models + - /nvme2/share:/nvme2/share + - /mnt/137_nvme2:/mnt/137_nvme2 + - /mnt/137_nvme3:/mnt/137_nvme3 + - /mnt/137_nvme4:/mnt/137_nvme4 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Start restful api + if: matrix.model != 'internlm2_5-20b' + run: | + lmdeploy serve api_server /nvme/qa_test_models/internlm/${{matrix.model}} --tp ${{matrix.tp}} --backend ${{matrix.backend}} > ${{env.REPORT_DIR}}/${{matrix.backend}}_${{matrix.model}}_start_chat_restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Test lmdeploy - restful api + if: matrix.model == 'Intern-S1' + timeout-minutes: 30 + run: | + pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}} and not internlm2_5' --alluredir=${{env.REPORT_DIR}}/interface-${{matrix.model}}-${{matrix.backend}}_ ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: matrix.model != 'internlm2_5-20b' + run: | + kill -15 "$restful_pid" + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + chmod -R 777 ${{env.REPORT_DIR}} + export workdir=$(pwd) + cd .. 
+ rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + get_coverage_report: + if: ${{!cancelled() && success()}} + runs-on: [self-hosted, h800-r1] + needs: [test_tools, test_restful] + timeout-minutes: 5 + container: + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme1/qa_test_models:/nvme1/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Get coverage report + run: | + pip install coverage + coverage combine ${{env.REPORT_DIR}} + coverage xml -o ${{env.REPORT_DIR}}/coverage.xml + coverage report -m + mv .coverage ${{env.REPORT_DIR}}/.coverage + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. 
+ rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 47e1929421..be64e8743f 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -136,7 +136,7 @@ jobs: - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml new file mode 100644 index 0000000000..068d074452 --- /dev/null +++ b/autotest/config-h800.yaml @@ -0,0 +1,123 @@ +model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource +dst_path: /nvme/qa_test_models/autotest_model +log_path: /nvme/qa_test_models/autotest_model/log +benchmark_path: /nvme/qa_test_models/benchmark-reports +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +env_tag: h800 + +tp_config: + Intern-S1: 8 + Qwen3-235B-A22B: 8 + Qwen3-235B-A22B-FP8: 8 + Qwen3-30B-A3B: 2 + Qwen3-32B: 2 + gpt-oss-120b: 2 + gpt-oss-120b-BF16: 4 + gpt-oss-20b-BF16: 2 + +turbomind_chat_model: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - Qwen/Qwen3-0.6B-FP8 + - Qwen/Qwen3-1.7B-FP8 + - Qwen/Qwen3-4B-FP8 + - Qwen/Qwen3-8B-FP8 + - Qwen/Qwen3-14B-FP8 + - Qwen/Qwen3-235B-A22B + - Qwen/Qwen3-235B-A22B-FP8 + - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-FP8 + - Qwen/Qwen3-32B + - Qwen/Qwen3-32B-FP8 + - openai/gpt-oss-120b + - openai/gpt-oss-20b + +pytorch_chat_model: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - Qwen/Qwen3-0.6B-FP8 + - Qwen/Qwen3-1.7B-FP8 + - Qwen/Qwen3-4B-FP8 + - Qwen/Qwen3-8B-FP8 + - Qwen/Qwen3-14B-FP8 + - Qwen/Qwen3-235B-A22B + - Qwen/Qwen3-235B-A22B-FP8 + - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-FP8 + - 
Qwen/Qwen3-32B + - Qwen/Qwen3-32B-FP8 + - unsloth/gpt-oss-120b-BF16 + - unsloth/gpt-oss-20b-BF16 + +turbomind_vl_model: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + +pytorch_vl_model: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + +turbomind_base_model: + - internlm/Intern-S1-mini + - Qwen/Qwen3-4B-FP8 + - openai/gpt-oss-20b + +pytorch_base_model: + - internlm/Intern-S1-mini + - Qwen/Qwen3-4B-FP8 + - unsloth/gpt-oss-20b-BF16 + +turbomind_quatization: + no_awq: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - Qwen/Qwen3-0.6B-FP8 + - Qwen/Qwen3-1.7B-FP8 + - Qwen/Qwen3-4B-FP8 + - Qwen/Qwen3-8B-FP8 + - Qwen/Qwen3-14B-FP8 + - Qwen/Qwen3-235B-A22B + - Qwen/Qwen3-235B-A22B-FP8 + - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-FP8 + - Qwen/Qwen3-32B + - Qwen/Qwen3-32B-FP8 + - openai/gpt-oss-120b + - openai/gpt-oss-20b + gptq: + - empty + no_kvint4: + - Qwen/Qwen3-0.6B-FP8 + - Qwen/Qwen3-1.7B-FP8 + - Qwen/Qwen3-4B-FP8 + - Qwen/Qwen3-8B-FP8 + - Qwen/Qwen3-14B-FP8 + - Qwen/Qwen3-235B-A22B + - Qwen/Qwen3-235B-A22B-FP8 + - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-FP8 + - Qwen/Qwen3-32B + - Qwen/Qwen3-32B-FP8 + no_kvint8: + - empty + +pytorch_quatization: + awq: + - empty + w8a8: + - empty + no_kvint4: + - Qwen/Qwen3-0.6B-FP8 + - Qwen/Qwen3-1.7B-FP8 + - Qwen/Qwen3-4B-FP8 + - Qwen/Qwen3-8B-FP8 + - Qwen/Qwen3-14B-FP8 + - Qwen/Qwen3-235B-A22B + - Qwen/Qwen3-235B-A22B-FP8 + - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-FP8 + - Qwen/Qwen3-32B + - Qwen/Qwen3-32B-FP8 + no_kvint8: + - empty diff --git a/autotest/config.yaml b/autotest/config.yaml index fab9a5af89..5844758229 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -33,8 +33,8 @@ tp_config: MiniCPM-V-2_6: 2 gemma-2-27b-it: 2 InternVL2-Llama3-76B-AWQ: 4 - gpt-oss-20b: 2 - gpt-oss-120b: 4 + gpt-oss-20b-bf16: 2 + gpt-oss-120b-bf16: 4 turbomind_chat_model: @@ -139,8 +139,8 @@ pytorch_chat_model: - Qwen/Qwen2.5-VL-32B-Instruct - Qwen/Qwen2-VL-2B-Instruct - Qwen/Qwen2-VL-7B-Instruct - - openai/gpt-oss-20b 
- - openai/gpt-oss-120b + - lmsys/gpt-oss-20b-bf16 + - lmsys/gpt-oss-120b-bf16 - mistralai/Mistral-7B-Instruct-v0.3 - mistralai/Mixtral-8x7B-Instruct-v0.1 - google/gemma-3-12b-it @@ -368,5 +368,5 @@ benchmark_model: - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-72B-Instruct - deepseek-ai/DeepSeek-V2-Lite-Chat - - openai/gpt-oss-20b - - openai/gpt-oss-120b + - lmsys/gpt-oss-20b-bf16 + - lmsys/gpt-oss-120b-bf16 diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py index 5dbcb6256a..f6ce5acfdd 100644 --- a/autotest/tools/chat/test_command_chat_hf_pytorch.py +++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py @@ -305,6 +305,7 @@ def test_hf_pytorch_chat_pr(config, model, cli_case_config): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct']) def test_modelscope_pytorch_chat_tp1(config, model, cli_case_config, worker_id): os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True' @@ -328,6 +329,7 @@ def test_modelscope_pytorch_chat_tp1(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', ['meta-llama/Llama-2-7b-chat-hf']) def test_pytorch_chat_with_lora_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -349,6 +351,7 @@ def test_pytorch_chat_with_lora_tp1(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['baichuan-inc/Baichuan2-13B-Chat']) def test_pytorch_chat_with_lora_tp2(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index 
c04d0e1e26..82c747358b 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -12,8 +12,7 @@ @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_chat_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' if 'coder' in model: usercase = 'code_testcase' @@ -22,8 +21,7 @@ def test_hf_turbomind_chat_tp1(config, model, communicator, cli_case_config, wor cli_case_config.get(usercase), model, 'turbomind', - cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator}') + cuda_prefix=get_cuda_prefix_by_workerid(worker_id)) if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -103,8 +101,7 @@ def test_hf_turbomind_chat_tp8(config, model, communicator, cli_case_config, wor @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_kvint4_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_chat_kvint4_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' if 'coder' in model: usercase = 'code_testcase' @@ -114,7 +111,7 @@ def test_hf_turbomind_chat_kvint4_tp1(config, model, communicator, cli_case_conf model, 'turbomind', cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator} --quant-policy 4') + extra='--quant-policy 4') if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -172,8 +169,7 @@ def test_hf_turbomind_chat_kvint4_tp4(config, model, communicator, 
cli_case_conf @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_kvint8_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_chat_kvint8_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' if 'coder' in model: usercase = 'code_testcase' @@ -183,7 +179,7 @@ def test_hf_turbomind_chat_kvint8_tp1(config, model, communicator, cli_case_conf model, 'turbomind', cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator} --quant-policy 8') + extra='--quant-policy 8') if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -261,12 +257,12 @@ def test_hf_turbomind_chat_kvint4_tp8(config, model, communicator, cli_case_conf @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', [ 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' ]) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_fallback_backend_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_chat_fallback_backend_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' if 'coder' in model: usercase = 'code_testcase' @@ -275,8 +271,7 @@ def test_hf_turbomind_chat_fallback_backend_tp1(config, model, communicator, cli cli_case_config.get(usercase), model, 'turbomind', - cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator}') + cuda_prefix=get_cuda_prefix_by_workerid(worker_id)) if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -288,12 +283,12 @@ def 
test_hf_turbomind_chat_fallback_backend_tp1(config, model, communicator, cli @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', [ 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' ]) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' if 'coder' in model: usercase = 'code_testcase' @@ -303,7 +298,7 @@ def test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, communicat model, 'turbomind', cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator} --quant-policy 8') + extra='--quant-policy 8') if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -315,6 +310,7 @@ def test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, communicat @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_fallback_backend_tp2(config, model, communicator, cli_case_config, worker_id): @@ -337,6 +333,7 @@ def test_hf_turbomind_chat_fallback_backend_tp2(config, model, communicator, cli @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat']) @pytest.mark.parametrize('communicator', get_communicator_list()) def 
test_hf_turbomind_chat_fallback_backend_kvint8_tp2(config, model, communicator, cli_case_config, worker_id): @@ -361,16 +358,14 @@ def test_hf_turbomind_chat_fallback_backend_kvint8_tp2(config, model, communicat @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='base_model')) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_base_tp1(config, model, communicator, cli_case_config, worker_id): +def test_hf_turbomind_base_tp1(config, model, cli_case_config, worker_id): usercase = 'base_testcase' result, chat_log, msg = hf_command_line_test(config, usercase, cli_case_config.get(usercase), model, 'turbomind', - cuda_prefix=get_cuda_prefix_by_workerid(worker_id), - extra=f'--communicator {communicator}') + cuda_prefix=get_cuda_prefix_by_workerid(worker_id)) if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -410,6 +405,28 @@ def test_hf_turbomind_base_tp2(config, model, communicator, cli_case_config, wor ]) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_pr(config, model, communicator, cli_case_config): + usercase = 'chat_testcase' + result, chat_log, msg = hf_command_line_test(config, + usercase, + cli_case_config.get(usercase), + model, + 'turbomind', + cuda_prefix='CUDA_VISIBLE_DEVICES=5,6', + extra=f'--communicator {communicator}') + + if chat_log is not None: + allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) + + assert result, msg + + +@pytest.mark.order(10) +@pytest.mark.usefixtures('cli_case_config') +@pytest.mark.hf_turbomind_chat +@pytest.mark.gpu_num_1 +@pytest.mark.pr_test +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3-8B']) +def test_hf_turbomind_chat_pr_gpu1(config, model, cli_case_config): usercase = 'chat_testcase' device_type = os.environ.get('DEVICE', 'cuda') if device_type == 'ascend': @@ -421,9 +438,7 @@ def 
test_hf_turbomind_chat_pr(config, model, communicator, cli_case_config): cli_case_config.get(usercase), model, 'turbomind', - cuda_prefix=f'{env_var}5,6', - extra=f'--communicator {communicator}') - + cuda_prefix=env_var + '5,6') if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -434,6 +449,7 @@ def test_hf_turbomind_chat_pr(config, model, communicator, cli_case_config): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct']) def test_modelscope_turbomind_chat_tp1(config, model, cli_case_config, worker_id): os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True' diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 14285f3c91..eaa7942e5c 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -7,7 +7,7 @@ from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline from lmdeploy.utils import is_bf16_supported -gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) +gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=10) def _is_bf16_supported_by_device(): diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index 5a649a1cca..62662f0b03 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -11,7 +11,7 @@ from lmdeploy.vl.constants import IMAGE_TOKEN from lmdeploy.vl.utils import encode_image_base64 -gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) +gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=10) PIC1 = 'tiger.jpeg' PIC2 = 'human-pose.jpg' diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py index b9a6939675..2ddc9240da 100644 --- 
a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py @@ -161,6 +161,7 @@ def test_pipeline_chat_pytorch_pr(config, common_case_config, model, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct']) def test_modelscope_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id): @@ -175,6 +176,7 @@ def test_modelscope_pipeline_chat_pytorch_tp1(config, common_case_config, model, @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', ['meta-llama/Llama-2-7b-chat-hf']) def test_pipeline_chat_pytorch_with_lora_tp1(config, common_case_config, model, worker_id): @@ -190,6 +192,7 @@ def test_pipeline_chat_pytorch_with_lora_tp1(config, common_case_config, model, @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', ['baichuan-inc/Baichuan2-13B-Chat']) def test_pipeline_chat_pytorch_with_lora_tp2(config, common_case_config, model, worker_id): diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py index 56caa2e6e7..f7a93db09a 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py @@ -12,11 +12,10 @@ @pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_tp1(config, common_case_config, model, communicator, worker_id): +def 
test_pipeline_chat_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: set_device_env_variable(worker_id) - run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {'communicator': communicator}) + run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {}) @pytest.mark.order(6) @@ -65,14 +64,10 @@ def test_pipeline_chat_tp8(config, common_case_config, model, communicator, work @pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, communicator, worker_id): +def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: set_device_env_variable(worker_id) - run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { - 'quant_policy': 4, - 'communicator': communicator - }) + run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, {'quant_policy': 4}) @pytest.mark.order(6) @@ -116,14 +111,10 @@ def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, communicato @pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8)) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, communicator, worker_id): +def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: set_device_env_variable(worker_id) - run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { - 'quant_policy': 8, - 'communicator': communicator - }) + run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, {'quant_policy': 8}) 
@pytest.mark.order(6) @@ -179,21 +170,16 @@ def test_pipeline_chat_kvint8_tp8(config, common_case_config, model, communicato @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', [ 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' ]) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, communicator, worker_id): +def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: set_device_env_variable(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) - run_pipeline_chat_test(config, - common_case_config, - model, - 'turbomind', - worker_id, {'communicator': communicator}, - is_smoke=True) + run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {}, is_smoke=True) @pytest.mark.order(6) @@ -201,12 +187,12 @@ def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, c @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('model', [ 'microsoft/Phi-3-mini-4k-instruct', 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8' ]) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, model, communicator, worker_id): +def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: set_device_env_variable(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) @@ -214,10 +200,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, m common_case_config, model, 
'turbomind-kvint', - worker_id, { - 'quant_policy': 8, - 'communicator': communicator - }, + worker_id, {'quant_policy': 8}, is_smoke=True) @@ -226,6 +209,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, m @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_tp2(config, common_case_config, model, communicator, worker_id): @@ -245,6 +229,7 @@ def test_pipeline_chat_fallback_backend_tp2(config, common_case_config, model, c @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp2(config, common_case_config, model, communicator, worker_id): @@ -286,6 +271,7 @@ def test_pipeline_chat_pr(config, common_case_config, model, communicator, worke @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct']) def test_modelscope_pipeline_chat_tp1(config, common_case_config, model, worker_id): diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index 44ded4473f..bcfd071eba 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -15,11 +15,10 @@ @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='vl_model')) -@pytest.mark.parametrize('communicator', 
get_communicator_list()) -def test_pipeline_chat_tp1(config, model, communicator, worker_id): +def test_pipeline_chat_tp1(config, model, worker_id): if 'gw' in worker_id: set_device_env_variable(worker_id) - run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}) + run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {}) @pytest.mark.order(6) @@ -57,14 +56,10 @@ def test_pipeline_chat_tp4(config, model, communicator, worker_id): @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4, model_type='vl_model')) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_kvint4_tp1(config, model, communicator, worker_id): +def test_pipeline_chat_kvint4_tp1(config, model, worker_id): if 'gw' in worker_id: set_device_env_variable(worker_id) - run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, { - 'quant_policy': 4, - 'communicator': communicator - }) + run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 4}) @pytest.mark.order(6) @@ -105,14 +100,10 @@ def test_pipeline_chat_kvint4_tp4(config, model, communicator, worker_id): @pytest.mark.gpu_num_1 @pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8, model_type='vl_model')) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_kvint8_tp1(config, model, communicator, worker_id): +def test_pipeline_chat_kvint8_tp1(config, model, worker_id): if 'gw' in worker_id: set_device_env_variable(worker_id) - run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, { - 'quant_policy': 8, - 'communicator': communicator - }) + run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8}) @pytest.mark.order(6) @@ -151,33 +142,26 @@ def test_pipeline_chat_kvint8_tp4(config, model, communicator, worker_id): 
@pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['OpenGVLab/InternVL2-4B', 'THUDM/glm-4v-9b', 'THUDM/glm-4v-9b-inner-4bits']) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_id): +def test_pipeline_chat_fallback_backend_tp1(config, model, worker_id): if 'gw' in worker_id: set_device_env_variable(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) - run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}, is_smoke=True) + run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {}, is_smoke=True) @pytest.mark.order(6) @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['OpenGVLab/InternVL2-4B', 'THUDM/glm-4v-9b', 'THUDM/glm-4v-9b-inner-4bits']) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, worker_id): +def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, worker_id): if 'gw' in worker_id: set_device_env_variable(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) - run_pipeline_vl_chat_test(config, - model, - BACKEND_KVINT, - worker_id, { - 'quant_policy': 8, - 'communicator': communicator - }, - is_smoke=True) + run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8}, is_smoke=True) @pytest.mark.order(6) @@ -207,8 +191,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp2(config, model, communicator, @pytest.mark.parametrize( 'model', ['liuhaotian/llava-v1.6-vicuna-7b', 'OpenGVLab/InternVL2-4B', 'OpenGVLab/InternVL2-8B', 'OpenGVLab/InternVL3-8B']) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def 
test_pipeline_pr_test(config, model, communicator, worker_id): +def test_pipeline_pr_test(config, model, worker_id): device_type = os.environ.get('DEVICE', 'cuda') if device_type == 'ascend': env_var = 'ASCEND_RT_VISIBLE_DEVICES' @@ -216,4 +199,4 @@ def test_pipeline_pr_test(config, model, communicator, worker_id): env_var = 'CUDA_VISIBLE_DEVICES' if 'gw' in worker_id: os.environ[f'{env_var}'] = str(int(get_cuda_id_by_workerid(worker_id)) + 5) - run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}, is_smoke=True) + run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {}, is_smoke=True) diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py index 6c48007565..9f6c747edb 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py @@ -176,6 +176,7 @@ def test_restful_chat_kvint8_tp8(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [{ 'model': 'Qwen/Qwen2.5-7B-Instruct', 'cuda_prefix': None, @@ -194,6 +195,7 @@ def test_modelscope_restful_chat_tp1(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api_pytorch @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [{ 'model': 'meta-llama/Llama-2-7b-chat-hf', 'cuda_prefix': None, @@ -212,6 +214,7 @@ def test_restful_chat_with_lora_tp1(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api_pytorch @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [{ 'model': 'baichuan-inc/Baichuan2-13B-Chat', @@ -232,6 +235,7 @@ def test_restful_chat_with_lora_tp2(config, common_case_config, worker_id): 
@pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B', @@ -253,6 +257,7 @@ def test_restful_chat_reasoning_tp1(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', @@ -274,6 +279,7 @@ def test_restful_chat_reasoning_tp2(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'internlm/internlm2_5-7b-chat', @@ -301,6 +307,7 @@ def test_restful_chat_tools_tp1(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'internlm/internlm2_5-20b-chat', @@ -322,6 +329,7 @@ def test_restful_chat_tools_tp2(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_4 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'meta-llama/Meta-Llama-3-1-70B-Instruct', diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py index b692bd17b5..daf2664662 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py @@ -80,7 +80,7 @@ def test_restful_chat_tp8(config, common_case_config, worker_id): def getKvintModelList(tp_num, quant_policy): model_list = [] - for communicator in get_communicator_list(): + for communicator in get_communicator_list(tp_num): model_list += [{ 'model': item, 'cuda_prefix': None, @@ -180,6 +180,7 @@ def test_restful_chat_kvint8_tp8(config, common_case_config, worker_id): 
@pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'microsoft/Phi-3-mini-4k-instruct', @@ -200,37 +201,19 @@ def test_restful_chat_kvint8_tp8(config, common_case_config, worker_id): 'model': 'microsoft/Phi-3-mini-4k-instruct', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --communicator native' - }, - { - 'model': 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', - 'cuda_prefix': None, - 'tp_num': 1, - 'extra': ' --communicator native' - }, - { - 'model': 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8', - 'cuda_prefix': None, - 'tp_num': 1, - 'extra': ' --communicator native' - }, - { - 'model': 'microsoft/Phi-3-mini-4k-instruct', - 'cuda_prefix': None, - 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8' }, { 'model': 'microsoft/Phi-3-mini-4k-instruct-inner-4bits', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8' }, { 'model': 'microsoft/Phi-3-mini-4k-instruct-inner-w8a8', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8' }, ], indirect=True) @@ -246,6 +229,7 @@ def test_restful_chat_fallback_backend_tp1(config, common_case_config, worker_id @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'google/gemma-2-27b-it', @@ -357,6 +341,7 @@ def test_restful_logprobs(worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [{ 'model': 'Qwen/Qwen2.5-7B-Instruct', 'cuda_prefix': None, @@ -376,6 +361,7 @@ def test_modelscope_restful_chat_tp1(config, common_case_config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) 
@pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B', @@ -397,6 +383,7 @@ def test_restful_chat_reasoning_tp1(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', @@ -418,6 +405,7 @@ def test_restful_chat_reasoning_tp2(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'internlm/internlm2_5-7b-chat', @@ -445,6 +433,7 @@ def test_restful_chat_tools_tp1(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'internlm/internlm2_5-20b-chat', @@ -466,6 +455,7 @@ def test_restful_chat_tools_tp2(config, worker_id): @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_4 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'meta-llama/Meta-Llama-3-1-70B-Instruct', diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py index 2cfbc00020..a98fbbfd6c 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py @@ -19,7 +19,7 @@ def prepare_environment(request, config, worker_id): def getModelList(tp_num): model_list = [] - for communicator in get_communicator_list(): + for communicator in get_communicator_list(tp_num): model_list += [{ 'model': item, 'cuda_prefix': None, @@ -65,7 +65,7 @@ def test_restful_chat_tp4(config, worker_id): def getKvintModelList(tp_num, quant_policy: int = None): model_list = [] - for communicator in get_communicator_list(): + for 
communicator in get_communicator_list(tp_num): model_list += [{ 'model': item, 'cuda_prefix': None, @@ -146,6 +146,7 @@ def test_restful_chat_kvint8_tp4(config, worker_id): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.other @pytest.mark.parametrize('prepare_environment', [ { 'model': 'OpenGVLab/InternVL2-4B', @@ -171,25 +172,25 @@ def test_restful_chat_kvint8_tp4(config, worker_id): 'model': 'OpenGVLab/InternVL2-4B', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8' }, { 'model': 'Qwen/Qwen2.5-VL-7B-Instruct', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8' }, { 'model': 'THUDM/glm-4v-9b', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8' }, { 'model': 'THUDM/glm-4v-9b-inner-4bits', 'cuda_prefix': None, 'tp_num': 1, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8' }, ], indirect=True) diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 0df8858b2c..51de106840 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -83,8 +83,8 @@ def get_all_model_list(tp_num: int = None, quant_policy: int = None, model_type: return case_list -def get_communicator_list(): - if _is_bf16_supported_by_device(): +def get_communicator_list(tp_num: int = None): + if tp_num != 1 and _is_bf16_supported_by_device(): return ['native', 'nccl'] return ['nccl'] diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index a1a9a3c512..fabc074d37 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -299,13 +299,14 @@ def internvl_vl_testcase(output_text, f, lang: str = 'en'): assert case_result, f'reason: separate images: panda should in {response}' with allure.step(f'internvl-separate-images2-{lang}'): 
response = get_response_from_output(output_text, f'internvl-separate-images2-{lang}') - case_result = any(word in response.lower() for word in ['panda', '熊猫', 'same', 'different', 'difference']) + case_result = any(word in response.lower() + for word in ['panda', '熊猫', 'same', 'different', 'difference', 'identical']) f.writelines(f'internvl-separate-images2-{lang} result: {case_result}, reason: panda should in {response} \n') with assume: assert case_result, f'reason: separate images2: panda should in {response}' with allure.step(f'internvl-video-{lang}'): response = get_response_from_output(output_text, f'internvl-video-{lang}') - case_result = any(word in response.lower() for word in ['red panda', 'eat', '熊猫', '竹子', 'food']) + case_result = any(word in response.lower() for word in ['red panda', 'eat', '熊猫', '竹子', 'food', 'hold']) f.writelines(f'internvl-video-{lang} result: {case_result}, reason: panda should in {response} \n') with assume: assert case_result, f'reason: video: panda should in {response}' diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 5aca937681..88c9468823 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -148,7 +148,7 @@ def run_all_step(config, cases_info, worker_id: str = '', port: int = DEFAULT_PO case_info = cases_info.get(case) - with allure.step(case + ' step2 - restful_test - openai chat'): + with allure.step(case + ' restful_test - openai chat'): restful_result, restful_log, msg = open_chat_test(config, case, case_info, model, http_url, worker_id) allure.attach.file(restful_log, attachment_type=allure.attachment_type.TEXT) with assume: