1010import pandas as pd
1111from typing import Optional , List , Tuple , Callable , Any
1212from tqdm .asyncio import tqdm_asyncio
13+ import os
14+ import time
15+ from datetime import datetime
1316
14- from examples .ags .benchmark .utils import generate_random_indices , log_mismatch
17+ from examples .aflow .benchmark .utils import generate_random_indices , log_mismatch
1518
1619def extract_number (text : str ) -> Optional [float ]:
1720 """Clean text and extract a single number"""
18- matches = re .findall (r"[-+]?\d+(?:,\d{3})*(?:\.\d+)?|\d+\.\d+" , text )
21+ print (f"text: { text } " )
22+ matches = re .findall (r"[-+]?\d+(?:,\d{3})*(?:\.\d+)?|\d+\.\d+" , str (text ))
23+ print (f"matches: { matches } " )
1924 if matches :
2025 last_number = matches [- 1 ].replace ("," , "" )
2126 try :
@@ -25,78 +30,77 @@ def extract_number(text: str) -> Optional[float]:
2530 else :
2631 return None
2732
def loose_match_score(expected_output: float, prediction: float, tolerance: float = 1e-6) -> int:
    """Score a numeric prediction against the expected value.

    Args:
        expected_output: Expected number (may be None when extraction failed).
        prediction: Predicted number (may be None when extraction failed).
        tolerance: Maximum absolute difference still counted as a match.

    Returns:
        1 when both numbers are present and within `tolerance`, else 0.
    """
    # Guard BOTH operands: extract_number returns None on failure, and
    # `abs(None - x)` would raise TypeError if only `prediction` were checked.
    if expected_output is None or prediction is None:
        return 0

    return 1 if abs(expected_output - prediction) <= tolerance else 0
4041
def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float, float]:
    """Persist evaluation results and return summary statistics.

    Args:
        results: Rows of (question, prediction, expected_output, score, cost).
        path: Directory in which to write the CSV (created if missing).

    Returns:
        (average_score, average_cost, total_cost). The CSV is named
        "<avg_score>_<timestamp>.csv" so repeated runs never overwrite
        each other.
    """
    df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"])

    # Guard the empty case: mean()/max() on an empty frame yield NaN, which
    # would produce a "nan_<timestamp>.csv" filename and NaN summary stats.
    if len(df) > 0:
        avg_score = df["score"].mean()
        # NOTE(review): "cost" looks like a running total per sample, so the
        # column max is the overall cost — confirm against the graph callable.
        t_cost = df["cost"].max()
        a_cost = t_cost / len(df)
    else:
        avg_score = 0.0
        t_cost = 0.0
        a_cost = 0.0

    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{avg_score:.5f}_{current_time}.csv"
    os.makedirs(path, exist_ok=True)  # tolerate a not-yet-created output dir
    output_file = os.path.join(path, filename)

    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")
    return avg_score, a_cost, t_cost
5255
async def evaluate_problem(input: str, graph: Callable, expected_output: str, path: str = None) -> Tuple[str, str, str, int, str]:
    """Run `graph` on one GSM8K problem and score the extracted number.

    Args:
        input: Question text (name shadows the builtin; kept for caller compatibility).
        graph: Async callable returning (output_text, cost) — assumed from the
               `prediction[0]` / `prediction[1]` indexing below.
        expected_output: Ground-truth answer text containing the expected number.
        path: Directory for mismatch logs; when None, mismatches are not logged.

    Returns:
        (question, model_output, expected_output, score, cost).
    """
    max_retries = 10
    retries = 0
    uni_score = 0

    while retries < max_retries:
        try:
            prediction = await graph(input) if graph else None
            cost = prediction[1]
            output = prediction[0]

            if output is not None:
                predicted_number = extract_number(output)
            else:
                predicted_number = None
            # The expected number is extracted regardless of model output.
            expected_number = extract_number(expected_output)

            print(f"predicted_number: {predicted_number}, expected_number: {expected_number}")
            uni_score = loose_match_score(expected_number, predicted_number)

            if uni_score == 0 and path is not None:
                log_mismatch(input, expected_output, output, predicted_number, path)

            break

        except Exception as e:
            retries += 1
            print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
            # BUG FIX: time.sleep() blocks the event loop, stalling every task
            # sharing the semaphore; use a non-blocking backoff instead.
            await asyncio.sleep(5 * retries)

            if retries == max_retries:
                print("Maximum retries reached. Skipping this sample.")
                output = str(e)
                cost = None
                uni_score = 0
                break

    return input, output, expected_output, uni_score, cost
9095
91- async def evaluate_all_problems (data : List [dict ], graph : Callable , max_concurrent_tasks : int = 20 ) -> List [Tuple [str , str , str , int , str ]]:
92- """Evaluate all problems"""
96+ async def evaluate_all_problems (data : List [dict ], graph : Callable , path , max_concurrent_tasks : int = 20 ) -> List [Tuple [str , str , str , int , str ]]:
9397 semaphore = asyncio .Semaphore (max_concurrent_tasks )
9498
9599 async def sem_evaluate (problem ):
96100 async with semaphore :
97101 input_text = problem ["question" ]
98102 expected_output = problem ["answer" ]
99- return await evaluate_problem (input_text , graph , expected_output )
103+ return await evaluate_problem (input_text , graph , expected_output , path )
100104
101105 tasks = [sem_evaluate (problem ) for problem in data ]
102106
@@ -113,38 +117,28 @@ async def load_data(file_path: str, samples=1, test=False) -> List[dict]:
113117
async def load_file_data(file_path: str, specific_indices: List[int] = None) -> List[dict]:
    """Asynchronously read a JSONL file; optionally keep only the given indices.

    Indices beyond the end of the file are silently dropped.
    """
    records: List[dict] = []
    async with aiofiles.open(file_path, mode="r", encoding="utf-8") as fh:
        async for raw_line in fh:
            records.append(json.loads(raw_line))

    if specific_indices is None:
        return records

    return [records[idx] for idx in specific_indices if idx < len(records)]
127129
async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
    """Sample GSM8K problems, evaluate them through `graph`, and persist a CSV.

    Returns:
        (average_score, total_cost).
    """
    problems = await load_data(file_path, samples, test=test)
    outcomes = await evaluate_all_problems(problems, graph, path, max_concurrent_tasks=20)
    average_score, average_cost, total_cost = save_results_to_csv(outcomes, path=path)
    print(f"Average score: {average_score:.5f}")
    print(f"Total Cost: {total_cost:.5f}")
    return average_score, total_cost
136137
async def optimize_gsm8k_evaluation(graph: Callable, file_path: str, path: str, va_list: list) -> Tuple[Any, Any, Any]:
    """Evaluate `graph` on a fixed validation index list (optimization loop).

    Returns:
        (average_score, average_cost, total_cost).
    """
    dataset = await load_file_data(file_path, va_list)
    outcomes = await evaluate_all_problems(dataset, graph, path, max_concurrent_tasks=8)
    avg_score, avg_cost, total_cost = save_results_to_csv(outcomes, path=path)
    print(f"Average score: {avg_score:.5f}")
    print(f"Total Cost: {total_cost:.5f}")
    return avg_score, avg_cost, total_cost
0 commit comments