Skip to content

[Minor] Add dataset NPMM-bench #1093

[Minor] Add dataset NPMM-bench

[Minor] Add dataset NPMM-bench #1093

Workflow file for this run

name: pr_run_test
on:
pull_request:
branches:
- "main"
paths-ignore:
- "docs/**"
- "**.md"
workflow_dispatch:
schedule:
- cron: '56 01 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
env:
BASE_SCORE: '{"MMBench_V11_MINI":{"Qwen2.5-VL-7B-Instruct":0.76363636,"InternVL3-8B":0.92727273,"llava-onevision-qwen2-0.5b-ov-hf":0.45454545},"MMStar_MINI":{"Qwen2.5-VL-7B-Instruct":0.6133333333333333,"InternVL3-8B":0.7,"llava-onevision-qwen2-0.5b-ov-hf":0.36},"AI2D_MINI":{"Qwen2.5-VL-7B-Instruct":0.7651821862348178,"InternVL3-8B":0.8218623481781376,"llava-onevision-qwen2-0.5b-ov-hf":0.48582995951417},"OCRBench_MINI":{"Qwen2.5-VL-7B-Instruct":15.7,"InternVL3-8B":17.3,"llava-onevision-qwen2-0.5b-ov-hf":5.5}}'
HF_HUB_CACHE: /mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub
HF_HUB_OFFLINE: 1
CONDA_PATH: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3
CONDA_ENV: vlm_pr_test
KUBEBRAIN_CLUSTER_ENTRY: https://h.pjlab.org.cn
KUBEBRAIN_NAMESPACE: ailab-opencompass
jobs:
vlm_test:
if: ${{!cancelled()}}
runs-on: [yidian_cu12_mllm]
strategy:
fail-fast: false
matrix:
dataset: ["MMBench_V11_MINI MMStar_MINI AI2D_MINI","OCRBench_MINI"]
model: ['llava-onevision-qwen2-0.5b-ov-hf', 'InternVL3-8B', 'Qwen2.5-VL-7B-Instruct']
include:
- model: llava-onevision-qwen2-0.5b-ov-hf
model_name: llava
- model: Qwen2.5-VL-7B-Instruct
model_name: qwen
- model: InternVL3-8B
model_name: internvl
- dataset: MMBench_V11_MINI MMStar_MINI AI2D_MINI
dataset_name: mmbench
- dataset: OCRBench_MINI
dataset_name: ocrbench
steps:
- name: Clean workdir
run: sudo git clean -ffdx
- name: clone_repo
uses: actions/checkout@v3
- name: reinstall vlmeval
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
pip uninstall vlmeval -y
pip install .
pip install numpy==1.23.0 transformers==4.57.1
- name: evaluation_model
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
pip list
rjob submit --metadata-name=vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=1 --cpu=16 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_wsp_cpu/vlmevalkit:auto-v0.0.10 --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub --env=HF_HUB_OFFLINE=1 --env=LMUData=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/LMUData --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public --mount=gpfs://gpfs1/mllm:/mnt/shared-storage-user/mllm --host-network=True -- bash -exc 'cd ${{github.workspace}}; source /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3/bin/activate; conda activate ${{env.CONDA_ENV}}; python run.py --data ${{matrix.dataset}} --model ${{matrix.model}} --work-dir /mnt/shared-storage-user/mllm/qa-llm-cicd/eval_report/${{ github.run_id }}/${{matrix.model}} --reuse --judge exact_matching 2>&1'
for i in {1..1200}; do
current_status=$(rjob get vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }} | grep -oP 'rjob [^:]+: \K[^ ]+')
echo "Current status: $current_status, stop checking"
if [[ $current_status == "Succeeded" ]]; then
echo "Task succeeded"
rjob logs job vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }} -n 100
exit 0
elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
echo "Task failed or stopped, fetching logs"
rjob logs job vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }}
exit 1
fi
sleep 6
done
rjob stop vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }}
rjob logs job vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }} -n 100
echo "Task timeout"
exit 1
- name: assert_result
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
cp -r /mnt/shared-storage-user/mllm/qa-llm-cicd/eval_report/${{ github.run_id }}/${{matrix.model}} outputs
python .github/scripts/assert_score.py --dataset "${{matrix.dataset}}" --base_score $BASE_SCORE --model-name ${{matrix.model}}