run.py (2 additions, 0 deletions)

@@ -400,6 +400,8 @@ def main():
            judge_kwargs['model'] = 'gpt-4o'
        elif listinstr(['AyaVisionBench'], dataset_name):
            judge_kwargs['model'] = 'gpt-4.1'
+        elif listinstr(['MathCanvas'], dataset_name):
+            judge_kwargs['model'] = 'gpt-4.1-2025-04-14'

        if args.use_verifier:
            judge_kwargs['use_verifier'] = True
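Note (not part of the patch): judge-model selection in run.py keys off substring matches against the dataset name via listinstr from vlmeval.smp, which is why the 'MathCanvas' pattern above also catches the 'MathCanvas-Bench' dataset. A minimal sketch of that behavior, assuming listinstr is a plain substring check rather than the library's exact code:

def listinstr_sketch(patterns, name):
    # Rough stand-in for vlmeval.smp.listinstr: True if any pattern is a substring of name.
    return any(p in name for p in patterns)

assert listinstr_sketch(['MathCanvas'], 'MathCanvas-Bench')    # this branch selects 'gpt-4.1-2025-04-14'
assert not listinstr_sketch(['MathCanvas'], 'AyaVisionBench')  # other names fall through to other branches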
vlmeval/dataset/__init__.py (2 additions, 2 deletions)

@@ -14,7 +14,7 @@
    ImageVQADataset, MathVision, OCRBench, MathVista, LLaVABench, LLaVABench_KO, VGRPBench, MMVet, MTVQADataset,
    TableVQABench, CustomVQADataset, CRPE, MathVerse, OlympiadBench, SeePhys, QSpatial, VizWiz, MMNIAH, LogicVista,
    MME_CoT, MMSci_Captioning, Physics_yale, TDBenchGrounding, WildDocBenchmark, OCR_Reasoning, PhyX, CountBenchQA,
-    ZEROBench, Omni3DBench, TallyQA, MMEReasoning, MMVMBench, BMMR, OCRBench_v2, AyaVisionBench
+    ZEROBench, Omni3DBench, TallyQA, MMEReasoning, MMVMBench, BMMR, OCRBench_v2, AyaVisionBench, MathCanvas
)

from .image_ccocr import CCOCRDataset

@@ -216,7 +216,7 @@ def evaluate(self, eval_file, **judge_kwargs):
    ZEROBench, SCAM, Omni3DBench, TallyQA, _3DSRBench, BMMR, AffordanceDataset,
    MMEReasoning, GOBenchDataset, SFE, ChartMimic, MMVMBench, XLRSBench,
    OmniEarthMCQBench, VisFactor, OSTDataset, OCRBench_v2, TreeBench, CVQA, M4Bench,
-    AyaVisionBench, TopViewRS, VLMBias, MMHELIX, MedqbenchMCQDataset,
+    AyaVisionBench, TopViewRS, VLMBias, MMHELIX, MedqbenchMCQDataset, MathCanvas,
    MedqbenchPairedDescriptionDataset, MedqbenchCaptionDataset, ChartMuseum, ChartQAPro, ReasonMap_Plus,
    olmOCRBench, OceanOCRBench, MATBench
]
vlmeval/dataset/image_vqa.py (100 additions, 0 deletions)

@@ -3626,3 +3626,103 @@ def evaluate(self, eval_file, **judge_kwargs):
        result_file = get_intermediate_file_path(eval_file, '_acc')
        dump(ret, result_file)
        return ret


class MathCanvas(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {
        "MathCanvas-Bench":
            "https://huggingface.co/datasets/shiwk24/MathCanvas-Bench/resolve/main/MathCanvas_Bench_VLMEvalKit.tsv"
    }
    DATASET_MD5 = {
        "MathCanvas-Bench": "9fd0b783ca416dbb20ecfb04d2711411"
    }

    HINT = (
        "Your task is to answer the question above. "
        "Give step by step reasoning, and conclude all the answers "
        "(include sub-questions) at the end of your solution."
    )

    def __init__(self, dataset='MathCanvas-Bench', skip_noimg=False):
        ROOT = LMUDataRoot()
        # You can override this variable to save image files to a different directory
        self.dataset_name = dataset
        self.img_root = osp.join(ROOT, 'images', dataset)

        data = self.load_data(dataset)
        self.skip_noimg = skip_noimg

        data['index'] = [str(x) for x in data['index']]
        data['image'] = [str(x) for x in data['image']]
        image_map = {x: y for x, y in zip(data['index'], data['image'])}
        images = [toliststr(image_map[k]) for k in data['index']]
        data['image'] = [x[0] if len(x) == 1 else x for x in images]

        if np.all([istype(x, int) for x in data['index']]):
            data['index'] = [int(x) for x in data['index']]

        self.data = data
        self.post_build(dataset)

    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]

        tgt_path = self.dump_image(line)
        question_text = line['question']

        pattern = r'(<image>)'
        tokens = re.split(pattern, question_text)

        num_placeholders = tokens.count('<image>')
        num_images = len(tgt_path)
        assert num_placeholders == num_images, (
            f"Mismatch between image placeholders ({num_placeholders}) and "
            f"image count ({num_images}) for index {line.get('index', 'N/A')}"
        )

        msgs = []
        img_idx = 0
        for token in tokens:
            if token == '<image>':
                msgs.append({'type': 'image', 'value': tgt_path[img_idx]})
                img_idx += 1
            elif token.strip():
                msgs.append({'type': 'text', 'value': token})

        msgs.append({'type': 'text', 'value': self.HINT})
        return msgs

    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.mathcanvas import evaluate_with_judge, summarize_mathcanvas_results

        judge_kwargs.update({
            "max_tokens": 2048,
            "temperature": 0.0,
        })

        config = {'hint': self.HINT, 'judge_kwargs': judge_kwargs}
        config_file = get_intermediate_file_path(eval_file, '_config')
        with open(config_file, 'w', encoding='utf-8') as f:
            json.dump(config, f, ensure_ascii=False, indent=4)

        detailed_results_file = get_intermediate_file_path(eval_file, '_meta')
        if not os.path.exists(detailed_results_file):
            print("Evaluating with judge, this may take a while...")
            eval_results_list = evaluate_with_judge(eval_file, self.data, **judge_kwargs)
            with open(detailed_results_file, 'w', encoding='utf-8') as f:
                json.dump(eval_results_list, f, ensure_ascii=False, indent=4)
        else:
            print(f"Loading existing evaluation results from {detailed_results_file}")
            eval_results_list = load(detailed_results_file)

        summary_dict = summarize_mathcanvas_results(eval_results_list)

        os.environ['EVAL_FORMAT'] = 'json'

        score_file = get_intermediate_file_path(eval_file, '_metrics')
        with open(score_file, 'w', encoding='utf-8') as f:
            json.dump(summary_dict, f, ensure_ascii=False, indent=4)

        return summary_dict
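For illustration (not part of the diff), the interleaving performed by MathCanvas.build_prompt can be traced on a toy question; the question text and image paths below are hypothetical stand-ins for a TSV row and the files produced by dump_image:

import re

question = 'As shown in <image>, fold the square along its diagonal. <image> shows the result.'
tgt_path = ['images/MathCanvas-Bench/42_0.jpg', 'images/MathCanvas-Bench/42_1.jpg']  # hypothetical paths

msgs, img_idx = [], 0
for token in re.split(r'(<image>)', question):
    if token == '<image>':
        msgs.append({'type': 'image', 'value': tgt_path[img_idx]})  # images consumed in placeholder order
        img_idx += 1
    elif token.strip():
        msgs.append({'type': 'text', 'value': token})  # skip empty fragments produced by the split

# msgs now alternates text and image entries in their original order; the real method
# additionally appends the HINT instruction as a final text item before returning.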
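Usage note (an assumption, not part of this PR): with the registration above, the benchmark should be runnable through the standard VLMEvalKit entry point, for example "python run.py --data MathCanvas-Bench --model <model_alias>". Given the run.py change, the judge defaults to gpt-4.1-2025-04-14, so an OpenAI-compatible judge credential (e.g. OPENAI_API_KEY) would need to be configured per the repository's usual setup.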