[Minor] Add dataset NPMM-bench #1106

Workflow file for this run

.github/workflows/pr-run-test.yml at b490745

	# Workflow name as displayed in the GitHub Actions UI.
	name: pr_run_test

	# Triggers: pull requests against main (ignoring docs/markdown-only changes),
	# manual dispatch, and a nightly schedule.
	on:
	  pull_request:
	    branches:
	      - "main"
	    paths-ignore:
	      - "docs/**"
	      - "**.md"
	  workflow_dispatch:
	  schedule:
	    # Every day at 01:56; GitHub evaluates schedule cron expressions in UTC.
	    - cron: '56 01 * * *'

	# One active run per pull request (or per ref when the event is not a PR);
	# a newly triggered run cancels the one still in progress in the same group.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
	  cancel-in-progress: true

	# Workflow-level environment, visible to every step of every job.
	env:
	  # JSON map of dataset -> model -> baseline score; handed to
	  # .github/scripts/assert_score.py through --base_score in the assert_result step.
	  BASE_SCORE: '{"MMBench_V11_MINI":{"Qwen2.5-VL-7B-Instruct":0.76363636,"InternVL3-8B":0.92727273,"llava-onevision-qwen2-0.5b-ov-hf":0.45454545},"MMStar_MINI":{"Qwen2.5-VL-7B-Instruct":0.6133333333333333,"InternVL3-8B":0.7,"llava-onevision-qwen2-0.5b-ov-hf":0.36},"AI2D_MINI":{"Qwen2.5-VL-7B-Instruct":0.7651821862348178,"InternVL3-8B":0.8218623481781376,"llava-onevision-qwen2-0.5b-ov-hf":0.48582995951417},"OCRBench_MINI":{"Qwen2.5-VL-7B-Instruct":15.7,"InternVL3-8B":17.3,"llava-onevision-qwen2-0.5b-ov-hf":5.5}}'
	  HF_HUB_CACHE: /mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub
	  # Quoted so the value is unambiguously the string "1" rather than a YAML integer.
	  HF_HUB_OFFLINE: '1'
	  # Conda installation and environment activated at the start of each run step.
	  CONDA_PATH: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3
	  CONDA_ENV: vlm_pr_test
	  # NOTE(review): not referenced by any step in this file; presumably read by
	  # the rjob CLI from the environment - confirm.
	  KUBEBRAIN_CLUSTER_ENTRY: https://h.pjlab.org.cn
	  KUBEBRAIN_NAMESPACE: ailab-opencompass

	jobs:
	  # For every (dataset group, model) matrix cell: reinstall vlmeval, submit the
	  # evaluation as an rjob, wait for it to finish, then check the scores with
	  # assert_score.py.
	  vlm_test:
	    if: ${{ !cancelled() }}
	    runs-on: [yidian_cu12_mllm]
	    strategy:
	      # Let the remaining matrix cells finish even when one of them fails.
	      fail-fast: false
	      matrix:
	        dataset: ["MMBench_V11_MINI MMStar_MINI AI2D_MINI", "OCRBench_MINI"]
	        model: ['llava-onevision-qwen2-0.5b-ov-hf', 'InternVL3-8B', 'Qwen2.5-VL-7B-Instruct']
	        # Attach short aliases to the matching cells; they are used to build
	        # the rjob name in the evaluation_model step.
	        include:
	          - model: llava-onevision-qwen2-0.5b-ov-hf
	            model_name: llava
	          - model: Qwen2.5-VL-7B-Instruct
	            model_name: qwen
	          - model: InternVL3-8B
	            model_name: internvl
	          - dataset: MMBench_V11_MINI MMStar_MINI AI2D_MINI
	            dataset_name: mmbench
	          - dataset: OCRBench_MINI
	            dataset_name: ocrbench
	    steps:
	      - name: Clean workdir
	        run: sudo git clean -ffdx
	      # NOTE(review): actions/checkout@v3 runs on the deprecated Node 16 runtime;
	      # consider v4 once the self-hosted runner version is confirmed to support it.
	      - name: clone_repo
	        uses: actions/checkout@v3
	      - name: reinstall vlmeval
	        run: |
	          . ${{env.CONDA_PATH}}/bin/activate
	          conda activate ${{env.CONDA_ENV}}
	          pip uninstall vlmeval -y
	          pip install .
	          pip install numpy==1.23.0 transformers==4.57.1
	      - name: evaluation_model
	        run: |
	          . ${{env.CONDA_PATH}}/bin/activate
	          conda activate ${{env.CONDA_ENV}}
	          pip list

	          # Job name is unique per run, matrix cell and attempt.
	          JOB_NAME="vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }}"

	          rjob submit --metadata-name="$JOB_NAME" \
	            --charged-group=opencompass_gpu \
	            --private-machine=group \
	            --group=opencompass_gpu \
	            --gpu=1 --cpu=16 --memory=32568 \
	            --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_wsp_cpu/vlmevalkit:auto-v0.0.10 \
	            --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub \
	            --env=HF_HUB_OFFLINE=1 \
	            --env=LMUData=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/LMUData \
	            --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd \
	            --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared \
	            --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline \
	            --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public \
	            --mount=gpfs://gpfs1/mllm:/mnt/shared-storage-user/mllm \
	            --host-network=True \
	            -- bash -exc 'cd ${{github.workspace}}; source /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3/bin/activate; conda activate ${{env.CONDA_ENV}}; python run.py --data ${{matrix.dataset}} --model ${{matrix.model}} --work-dir /mnt/shared-storage-user/mllm/qa-llm-cicd/eval_report/${{ github.run_id }}/${{matrix.model}} --reuse --judge exact_matching 2>&1'

	          # Poll every 6 seconds, at most 1200 times (about two hours), until the
	          # job reports a terminal status.
	          for i in {1..1200}; do
	            # `|| true`: the default run shell is `bash -e`, so a grep that matches
	            # nothing would otherwise abort the step instead of polling again.
	            current_status=$(rjob get "$JOB_NAME" | grep -oP 'rjob [^:]+: \K[^ ]+' || true)
	            echo "Current status: $current_status"
	            if [[ $current_status == "Succeeded" ]]; then
	              echo "Task succeeded"
	              rjob logs job "$JOB_NAME" -n 100
	              exit 0
	            elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
	              echo "Task failed or stopped, fetching logs"
	              rjob logs job "$JOB_NAME"
	              exit 1
	            fi
	            sleep 6
	          done
	          # No terminal status within the polling budget: stop the job and fail.
	          rjob stop "$JOB_NAME"
	          rjob logs job "$JOB_NAME" -n 100
	          echo "Task timeout"
	          exit 1
	      - name: assert_result
	        run: |
	          . ${{env.CONDA_PATH}}/bin/activate
	          conda activate ${{env.CONDA_ENV}}
	          # Copy the report written by the rjob into ./outputs in the workspace.
	          cp -r /mnt/shared-storage-user/mllm/qa-llm-cicd/eval_report/${{ github.run_id }}/${{matrix.model}} outputs
	          # Quote the JSON and the model name so each reaches the script as one argument.
	          python .github/scripts/assert_score.py --dataset "${{matrix.dataset}}" --base_score "$BASE_SCORE" --model-name "${{matrix.model}}"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Minor] Add dataset NPMM-bench #1106

Workflow file

[Minor] Add dataset NPMM-bench #1106

Uh oh!

Workflow file for this run