Skip to content

Commit c36dbff

Browse files
[Benchmark] Add support for VenusBench-GD (#1449)
* add venusbench
* fix lint
* fix lint

Co-authored-by: Haodong Duan <dhd@pku.edu.cn>
1 parent ef83c66 commit c36dbff

File tree

2 files changed

+186
-1
lines changed

2 files changed

+186
-1
lines changed

vlmeval/dataset/GUI/venusbench.py

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
import os
2+
import re
3+
import ast
4+
5+
import pandas as pd
6+
import numpy as np
7+
8+
from ..image_base import ImageBaseDataset
9+
from ..utils import build_judge
10+
from ...smp import *
11+
from ipdb import set_trace as st
12+
13+
logger = get_logger("RUN")
14+
15+
# System message for the GUI agent: answer must be a pyautogui click/moveTo
# action in the exact `pyautogui.click(x=?, y=?)` format, which the parser
# (parse_bbox_aguvis) extracts coordinates from.
SYSTEM_PROMPT = "You are a GUI agent. You are given a task and a screenshot of the screen. " \
    "You need to perform pyautogui click/moveTo action to complete the task. " \
    "The answer format is `pyautogui.click(x=?, y=?), x and y is necessary`"

# Per-sample user message template; `{instruction}` is filled with the
# task text (the dataset's `question` column) in build_prompt.
USER_INSTRUCTION = "Please complete the following tasks by clicking using `pyautogui.click`:\n{instruction}"
20+
21+
22+
def parse_bbox_aguvis(response):
    """Extract a click point from an ``x=?, y=?`` style model response.

    Searches the raw prediction string for the first ``x=<num>, y=<num>``
    pair (optional whitespace after the comma is tolerated, so both
    ``x=0.5, y=0.3`` and ``x=0.5,y=0.3`` parse).

    Args:
        response (str): raw model prediction, e.g.
            ``"pyautogui.click(x=0.5, y=0.3)"``.

    Returns:
        list[float]: ``[x, y]`` click point; ``[0.0, 0.0]`` when no
        coordinate pair is found (callers score this as a miss).
    """
    # Generalized from `, ` to `,\s*`: predictions without a space after
    # the comma previously fell back to (0, 0) and were scored wrong.
    match = re.search(r"x=([\d.]+),\s*y=([\d.]+)", response)
    if match:
        return [float(match.group(1)), float(match.group(2))]
    return [0.0, 0.0]
29+
30+
31+
class VenusBench_GD(ImageBaseDataset):
    """GUI grounding benchmark (VenusBench-GD).

    Each sample is a screenshot plus a natural-language task; the model must
    respond with a ``pyautogui.click(x=?, y=?)`` action. A prediction is
    correct when the parsed click point falls inside the ground-truth
    bounding box of the target UI element.
    """

    MODALITY = "IMAGE"
    TYPE = "GUI"
    DATASET_URL = {
        "VenusBench-GD": "https://huggingface.co/datasets/Zery/VBGD_Dataset/resolve/main/VenusBench.tsv",
    }
    DATASET_MD5 = {
        'VenusBench-GD': '6a2fe92d3ecf5a3b6503a1fe4891c5ea'
    }

    def __init__(
        self,
        dataset="VenusBench-GD",
        skip_noimg=True,
        skeleton=False,
    ):
        """Load TSV metadata and resolve de-duplicated base64 images.

        Args:
            dataset (str): dataset key; must exist in ``DATASET_URL``.
            skip_noimg (bool): drop rows whose ``image`` field is NaN.
            skeleton (bool): if True, only set up paths and return without
                loading any data (lightweight construction).
        """
        ROOT = LMUDataRoot()
        self.dataset_name = dataset
        self.img_root = osp.join(ROOT, "images", self.dataset_name)

        if skeleton:
            return

        data = self.load_data(dataset)
        self.skip_noimg = skip_noimg
        if skip_noimg and "image" in data:
            data = data[~pd.isna(data["image"])]

        # Ensure every row has a (1-based, string) index for image lookup.
        if "index" not in data:
            data["index"] = [str(idx + 1) for idx in range(len(data))]

        self.meta_only = True
        self.parse_response_func = parse_bbox_aguvis

        if "image" in data:
            data["image"] = [str(x) for x in data["image"]]
            image_map = {x: y for x, y in zip(data["index"], data["image"])}
            # De-duplication scheme: a short entry (<= 64 chars) is not a
            # base64 payload but a reference to the index of the row that
            # actually holds the image; follow the reference once.
            for k in image_map:
                if len(image_map[k]) <= 64:
                    idx = image_map[k]
                    assert idx in image_map and len(image_map[idx]) > 64
                    image_map[k] = image_map[idx]

            images = [toliststr(image_map[k]) for k in data["index"]]
            data["image"] = [x[0] if len(x) == 1 else x for x in images]
            self.meta_only = False

        self.data = data

    @classmethod
    def get_action_space(cls):
        """Return the action-space description (free-form; none enforced)."""
        return ""

    @classmethod
    def get_trajectory(cls, line):
        """Return the minimal trajectory dict (task text only) for a row."""
        traj_dict = {}
        traj_dict["task"] = line["question"]
        return traj_dict

    def build_prompt(self, line):
        """Build the multimodal message list for one sample.

        Args:
            line (int | pd.Series): row index or the row itself.

        Returns:
            list[dict]: system prompt, image message(s), then the user
            instruction with the task text filled in.
        """
        if isinstance(line, int):
            line = self.data.iloc[line]
        tgt_path = self.dump_image(line)
        user_instruction = USER_INSTRUCTION.format(instruction=line["question"])
        msgs = [dict(role="system", type="text", value=SYSTEM_PROMPT)]
        if isinstance(tgt_path, list):
            msgs.extend([dict(type="image", value=p) for p in tgt_path])
        else:
            # BUGFIX: was `msgs = [dict(...)]`, which silently discarded the
            # system prompt for the (common) single-image case.
            msgs.append(dict(type="image", value=tgt_path))
        msgs.append(dict(type="text", value=user_instruction))
        return msgs

    def evaluate(self, eval_file, **judge_kwargs):
        """Score predictions in ``eval_file`` against ground-truth boxes.

        Per-sample outcomes recorded under ``category:ui_type`` keys are
        1 (hit), 0 (miss) or -1 (exception / unparseable prediction).

        Args:
            eval_file (str): path to the inference results (must contain
                ``bbox`` and ``prediction`` columns).
            **judge_kwargs: unused; accepted for interface compatibility.

        Returns:
            dict: per-key counts, ``Overall_Accuracy``, ``Format_Err_Rate``
            and per-category accuracies, all as percentages.
        """
        stats = defaultdict(list)
        result = []

        data = load(eval_file)
        assert "bbox" in data and "prediction" in data
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]

        for i in tqdm(range(len(lines))):
            line = lines[i]
            bbox = (
                line["bbox"]
                if isinstance(line["bbox"], list)
                else ast.literal_eval(line["bbox"])
            )
            # The format of bbox in VenusBench-GD is (x_min, y_min, x_max, y_max)
            image = Image.open(os.path.join(self.img_root, line["image_path"]))
            img_size = image.size

            # Absolute to relative
            bbox = [
                bbox[0] / img_size[0],
                bbox[1] / img_size[1],
                bbox[2] / img_size[0],
                bbox[3] / img_size[1],
            ]

            key = line["category"] + ":" + line['ui_type']
            prediction = str(line["prediction"])
            try:
                click_point = self.parse_response_func(prediction)
                # Heuristic: coordinates > 1 are absolute pixels; normalize
                # them into the same relative space as the bbox.
                if click_point[0] > 1 or click_point[1] > 1:
                    click_point = (click_point[0] / img_size[0], click_point[1] / img_size[1])

                match = (bbox[0] <= click_point[0] <= bbox[2]) and \
                    (bbox[1] <= click_point[1] <= bbox[3])

                stats[key].append(1 if match else 0)
                is_wrong_format = False
            except Exception as e:
                logger.warning(f"exception in venusbench eval:{e}")
                stats[key].append(-1)
                match, is_wrong_format, click_point = False, True, None

            result.append(
                {
                    "img_path": os.path.join(self.img_root, line["image_path"]),
                    "text": line["question"],
                    "bbox": line["bbox"],
                    "parsed_bbox": bbox,
                    "type": line["ui_type"],
                    "category": line["category"],
                    "match": match,
                    "is_wrong_format": is_wrong_format,
                    "pred": click_point,
                }
            )

        final_score_dict = {}
        final_score_dict.update({k + ':cnt': len(stats[k]) for k in stats})

        # Flatten all per-key outcomes; -1 (format error) counts as a miss
        # in Overall_Accuracy and is tallied separately as Format_Err_Rate.
        full_stats = []
        for v in stats.values():
            full_stats.extend(v)
        final_score_dict['Overall_Accuracy'] = np.mean([x > 0 for x in full_stats]) * 100
        final_score_dict['Format_Err_Rate'] = np.mean([x < 0 for x in full_stats]) * 100

        cates = list(set([line["category"] for line in lines]))
        for c in cates:
            # BUGFIX: the original comprehension collected the *list* `v`
            # once per element (`[v for ... for x in v]`) and then evaluated
            # `np.mean([x[0] > 0 for x in [sub_stats]])`, comparing a list
            # to 0 — a TypeError. Flatten to scalar outcomes instead.
            sub_stats = [x for k, v in stats.items() if k.split(":")[0] == c for x in v]
            if len(sub_stats) > 0:
                final_score_dict[c + '_Accuracy'] = np.mean([x > 0 for x in sub_stats]) * 100

        score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
        dump(final_score_dict, score_pth)
        return final_score_dict

vlmeval/dataset/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@
9999
from .GUI.screenspot import ScreenSpot
100100
from .GUI.screenspot_v2 import ScreenSpotV2
101101
from .GUI.screenspot_pro import ScreenSpot_Pro
102+
from .GUI.venusbench import VenusBench_GD
102103
from .mmifeval import MMIFEval
103104
from .chartmimic import ChartMimic
104105
from .m4bench import M4Bench
@@ -258,7 +259,7 @@ def evaluate(self, eval_file, **judge_kwargs):
258259
MMNIAH, CMMMU, VLRewardBench, WeMath, LogicVista, MMMUProDataset,
259260
CreationMMBenchDataset, ImageShortQADataset, MMAlignBench, OmniDocBench,
260261
VLM2Bench, VMCBenchDataset, EMMADataset, MME_CoT, MOAT, MedXpertQA_MM_test,
261-
LEGO, MMSci_Captioning, Physics_yale, ScreenSpot_Pro, ScreenSpot,
262+
LEGO, MMSci_Captioning, Physics_yale, ScreenSpot_Pro, ScreenSpot, VenusBench_GD,
262263
ScreenSpotV2, OSWorld_G, VBGD, MMIFEval, Spatial457, VisuLogic, CVBench, PathVQA_VAL,
263264
PathVQA_TEST, TDBench, TDBenchGrounding, MicroBench, CharXiv, OmniMedVQA,
264265
WildDocBenchmark, MSEarthMCQ, OCR_Reasoning, PhyX, VLMBlind, CountBenchQA,

0 commit comments

Comments
 (0)