import os
import re
import ast

import pandas as pd
import numpy as np

from ..image_base import ImageBaseDataset
from ..utils import build_judge
from ...smp import *

logger = get_logger("RUN")

SYSTEM_PROMPT = "You are a GUI agent. You are given a task and a screenshot of the screen. " \
    "You need to perform a pyautogui click/moveTo action to complete the task. " \
    "The answer format is `pyautogui.click(x=?, y=?)`; both x and y are required."

USER_INSTRUCTION = "Please complete the following task by clicking with `pyautogui.click`:\n{instruction}"


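# Note (illustrative): the prompts above steer the model toward answers such as
# "pyautogui.click(x=0.42, y=0.17)". Some models may answer in absolute pixels instead;
# evaluate() below rescales any coordinate larger than 1 by the screenshot size before scoring.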
def parse_bbox_aguvis(response):
    match = re.search(r"x=([\d.]+), y=([\d.]+)", response)
    if match:
        click_point = [float(match.group(1)), float(match.group(2))]
    else:
        click_point = [0.0, 0.0]
    return click_point
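# Example (illustrative): parse_bbox_aguvis("pyautogui.click(x=0.42, y=0.17)") returns
# [0.42, 0.17]; a response without a recognizable "x=..., y=..." pattern falls back to
# [0.0, 0.0], which will typically be scored as a miss during evaluation.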


class VenusBench_GD(ImageBaseDataset):
    MODALITY = "IMAGE"
    TYPE = "GUI"
    # Download URL and md5 checksum of the benchmark TSV, used by the base dataset loader
    DATASET_URL = {
        "VenusBench-GD": "https://huggingface.co/datasets/Zery/VBGD_Dataset/resolve/main/VenusBench.tsv",
    }
    DATASET_MD5 = {
        'VenusBench-GD': '6a2fe92d3ecf5a3b6503a1fe4891c5ea'
    }

    def __init__(self, dataset="VenusBench-GD", skip_noimg=True, skeleton=False):
        ROOT = LMUDataRoot()
        self.dataset_name = dataset
        self.img_root = osp.join(ROOT, "images", self.dataset_name)

        if skeleton:
            return

        data = self.load_data(dataset)
        self.skip_noimg = skip_noimg
        if skip_noimg and "image" in data:
            data = data[~pd.isna(data["image"])]

        # Ensure every sample has an index
        if "index" not in data:
            data["index"] = [str(idx + 1) for idx in range(len(data))]

        self.meta_only = True
        self.parse_response_func = parse_bbox_aguvis

        if "image" in data:
            data["image"] = [str(x) for x in data["image"]]
            image_map = {x: y for x, y in zip(data["index"], data["image"])}
            # Short values are cross-references to another index that holds the actual
            # encoded image; resolve them so every index maps to real image data.
            for k in image_map:
                if len(image_map[k]) <= 64:
                    idx = image_map[k]
                    assert idx in image_map and len(image_map[idx]) > 64
                    image_map[k] = image_map[idx]

            images = [toliststr(image_map[k]) for k in data["index"]]
            data["image"] = [x[0] if len(x) == 1 else x for x in images]
            self.meta_only = False

        self.data = data

    @classmethod
    def get_action_space(cls):
        return ""

    @classmethod
    def get_trajectory(cls, line):
        traj_dict = {}
        traj_dict["task"] = line["question"]
        return traj_dict

    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]
        tgt_path = self.dump_image(line)
        user_instruction = USER_INSTRUCTION.format(instruction=line["question"])
        msgs = []
        msgs.append(dict(role="system", type="text", value=SYSTEM_PROMPT))
        if isinstance(tgt_path, list):
            msgs.extend([dict(type="image", value=p) for p in tgt_path])
        else:
            # Append rather than overwrite msgs, so the system prompt is kept
            msgs.append(dict(type="image", value=tgt_path))
        msgs.append(dict(type="text", value=user_instruction))
        return msgs

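    # build_prompt() above returns, for a single-image sample, a message list shaped like
    # (values illustrative):
    #   [dict(role="system", type="text", value=SYSTEM_PROMPT),
    #    dict(type="image", value="<path/to/screenshot.png>"),
    #    dict(type="text", value="Please complete the following task ...")]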
    def evaluate(self, eval_file, **judge_kwargs):
        stats = defaultdict(list)
        result = []

        data = load(eval_file)
        assert "bbox" in data and "prediction" in data
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]

        for i in tqdm(range(len(lines))):
            line = lines[i]
            bbox = (
                line["bbox"]
                if isinstance(line["bbox"], list)
                else ast.literal_eval(line["bbox"])
            )
            # The format of bbox in VenusBench-GD is (x_min, y_min, x_max, y_max)
            image = Image.open(os.path.join(self.img_root, line["image_path"]))
            img_size = image.size

            # Absolute to relative
            bbox = [
                bbox[0] / img_size[0],
                bbox[1] / img_size[1],
                bbox[2] / img_size[0],
                bbox[3] / img_size[1],
            ]
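            # Both the ground-truth box and the predicted click point are compared in this
            # normalized [0, 1] coordinate space; pixel-space predictions are rescaled below.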

            key = line["category"] + ":" + line['ui_type']
            prediction = str(line["prediction"])
            try:
                click_point = self.parse_response_func(prediction)
                if click_point[0] > 1 or click_point[1] > 1:
                    click_point = (click_point[0] / img_size[0], click_point[1] / img_size[1])

                match = (bbox[0] <= click_point[0] <= bbox[2]) and \
                    (bbox[1] <= click_point[1] <= bbox[3])

                if match:
                    stats[key].append(1)
                else:
                    stats[key].append(0)
                is_wrong_format = False
            except Exception as e:
                logger.warning(f"exception in venusbench eval: {e}")
                stats[key].append(-1)
                match, is_wrong_format, click_point = False, True, None

            result.append(
                {
                    "img_path": os.path.join(self.img_root, line["image_path"]),
                    "text": line["question"],
                    "bbox": line["bbox"],
                    "parsed_bbox": bbox,
                    "type": line["ui_type"],
                    "category": line["category"],
                    "match": match,
                    "is_wrong_format": is_wrong_format,
                    "pred": click_point,
                }
            )

        final_score_dict = {}
        final_score_dict.update({k + ':cnt': len(stats[k]) for k in stats})

        full_stats = []
        for v in stats.values():
            full_stats.extend(v)
        final_score_dict['Overall_Accuracy'] = np.mean([x > 0 for x in full_stats]) * 100
        final_score_dict['Format_Err_Rate'] = np.mean([x < 0 for x in full_stats]) * 100

        cates = list(set([line["category"] for line in lines]))
        for c in cates:
            # Flatten the per-(category:ui_type) stats belonging to this category
            sub_stats = [x for k, v in stats.items() if k.split(":")[0] == c for x in v]
            if len(sub_stats) > 0:
                final_score_dict[c + '_Accuracy'] = np.mean([x > 0 for x in sub_stats]) * 100

        score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
        dump(final_score_dict, score_pth)
        return final_score_dict
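

# Illustrative usage sketch (not part of the benchmark definition). In VLMEvalKit the
# dataset is normally driven by the standard run script; the lines below only assume
# that a prediction file produced by that pipeline already exists.
if __name__ == "__main__":
    dataset = VenusBench_GD("VenusBench-GD")   # loads the TSV (downloading it if needed)
    sample_msgs = dataset.build_prompt(0)      # prompt messages for the first sample
    print(sample_msgs[-1]["value"])
    # scores = dataset.evaluate("VenusBench-GD_MyModel.xlsx")  # hypothetical prediction file
    # print(scores)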