thecaptain789
diff --git a/examples/ags/benchmark/drop.py b/examples/aflow/benchmark/drop.py
rename from examples/ags/benchmark/drop.py
rename to examples/aflow/benchmark/drop.py
Lines changed: 1 addition & 1 deletion
diff --git a/examples/ags/benchmark/gsm8k.py b/examples/aflow/benchmark/gsm8k.py
rename from examples/ags/benchmark/gsm8k.py
rename to examples/aflow/benchmark/gsm8k.py
Lines changed: 36 additions & 42 deletions
diff --git a/examples/ags/benchmark/hotpotqa.py b/examples/aflow/benchmark/hotpotqa.py
rename from examples/ags/benchmark/hotpotqa.py
rename to examples/aflow/benchmark/hotpotqa.py
Lines changed: 1 addition & 1 deletion
diff --git a/examples/ags/benchmark/humaneval.py b/examples/aflow/benchmark/humaneval.py
rename from examples/ags/benchmark/humaneval.py
rename to examples/aflow/benchmark/humaneval.py
Lines changed: 12 additions & 5 deletions
diff --git a/examples/ags/benchmark/math.py b/examples/aflow/benchmark/math.py
rename from examples/ags/benchmark/math.py
rename to examples/aflow/benchmark/math.py
Lines changed: 1 addition & 1 deletion
diff --git a/examples/ags/benchmark/mbpp.py b/examples/aflow/benchmark/mbpp.py
rename from examples/ags/benchmark/mbpp.py
rename to examples/aflow/benchmark/mbpp.py
Lines changed: 10 additions & 9 deletions
diff --git a/examples/ags/benchmark/utils.py b/examples/aflow/benchmark/utils.py
rename from examples/ags/benchmark/utils.py
rename to examples/aflow/benchmark/utils.py
diff --git a/examples/ags/data/drop_test.jsonl b/examples/aflow/data/drop_test.jsonl
rename from examples/ags/data/drop_test.jsonl
rename to examples/aflow/data/drop_test.jsonl
diff --git a/examples/ags/data/drop_validate.jsonl b/examples/aflow/data/drop_validate.jsonl
rename from examples/ags/data/drop_validate.jsonl
rename to examples/aflow/data/drop_validate.jsonl
@@ -9,7 +9,7 @@
 from scipy.optimize import linear_sum_assignment
 from tqdm.asyncio import tqdm_asyncio
 
-from examples.ags.benchmark.utils import generate_random_indices
+from examples.aflow.benchmark.utils import generate_random_indices
 
 global cost
 cost = 0
 
@@ -10,12 +10,17 @@
 import pandas as pd
 from typing import Optional, List, Tuple, Callable, Any
 from tqdm.asyncio import tqdm_asyncio
+import os
+import time
+from datetime import datetime
 
-from examples.ags.benchmark.utils import generate_random_indices, log_mismatch
+from examples.aflow.benchmark.utils import generate_random_indices, log_mismatch
 
 def extract_number(text: str) -> Optional[float]:
     """Clean text and extract a single number"""
-    matches = re.findall(r"[-+]?\d+(?:,\d{3})*(?:\.\d+)?|\d+\.\d+", text)
+    print(f"text: {text}")
+    matches = re.findall(r"[-+]?\d+(?:,\d{3})*(?:\.\d+)?|\d+\.\d+", str(text))
+    print(f"matches: {matches}")
     if matches:
         last_number = matches[-1].replace(",", "")
         try:
@@ -25,78 +30,77 @@ def extract_number(text: str) -> Optional[float]:
     else:
         return None
 
-def loose_match_score(expected_output: str, prediction: str, tolerance: float = 1e-6) -> int:
-    """Loose match score calculation function"""
-    expected_number = extract_number(expected_output)
-    predicted_number = extract_number(prediction)
-
-    if expected_number is None or predicted_number is None:
+def loose_match_score(expected_output: float, prediction: float, tolerance: float = 1e-6) -> int:
+    if prediction is None:
         return 0
-
-    if abs(expected_number - predicted_number) <= tolerance:
+    
+    if abs(expected_output - prediction) <= tolerance:
         return 1
     else:
         return 0
 
-def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]:
-    """Save results to CSV file"""
+def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float, float]:
     df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"])
-    average_score = df["score"].mean()
-    total_cost = df["cost"].max()
-    average_cost = total_cost / len(df) if len(df) > 0 else 0
+    avg_score = df["score"].mean()
+    t_cost = df["cost"].max()
+    a_cost = t_cost / len(df) if len(df) > 0 else 0
+
+    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
+    filename = f"{avg_score:.5f}_{current_time}.csv"
+    output_file = os.path.join(path, filename)
 
-    output_file = f"{path}/{average_score:.5f}.csv"
     df.to_csv(output_file, index=False)
     print(f"Results saved to {output_file}")
-    return average_score, average_cost, total_cost
+    return avg_score, a_cost, t_cost
 
 async def evaluate_problem(input: str, graph: Callable, expected_output: str, path: str = None) -> Tuple[str, str, str, int, str]:
-    """Evaluate a single problem"""
-    max_retries = 5
+    max_retries = 10
     retries = 0
+    uni_score = 0
+
     while retries < max_retries:
         try:
             prediction = await graph(input) if graph else None
             cost = prediction[1]
             output = prediction[0]
-            
+
             if output is not None:
                 predicted_number = extract_number(output)
-                expected_output = extract_number(expected_output)
+                expected_number = extract_number(expected_output)
             else:
                 predicted_number = None
+                expected_number = extract_number(expected_output)
 
-            uni_score = loose_match_score(expected_output, predicted_number)
+            print(f"predicted_number: {predicted_number}, expected_number: {expected_number}")
+            uni_score = loose_match_score(expected_number, predicted_number)
 
             if uni_score == 0 and path is not None:
                 log_mismatch(input, expected_output, output, predicted_number, path)
-            else:
-                pass
 
             break
 
         except Exception as e:
             retries += 1
             print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
+            time.sleep(5 * retries)
 
             if retries == max_retries:
                 print("Maximum retries reached. Skipping this sample.")
                 output = str(e)
                 cost = None
-                score = 0
+                uni_score = 0
                 break
 
-    return input, output, expected_output, score, cost
+    return input, output, expected_output, uni_score, cost
 
-async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 20) -> List[Tuple[str, str, str, int, str]]:
-    """Evaluate all problems"""
+async def evaluate_all_problems(data: List[dict], graph: Callable, path, max_concurrent_tasks: int = 20) -> List[Tuple[str, str, str, int, str]]:
     semaphore = asyncio.Semaphore(max_concurrent_tasks)
 
     async def sem_evaluate(problem):
         async with semaphore:
             input_text = problem["question"]
             expected_output = problem["answer"]
-            return await evaluate_problem(input_text, graph, expected_output)
+            return await evaluate_problem(input_text, graph, expected_output, path)
 
     tasks = [sem_evaluate(problem) for problem in data]
 
@@ -113,38 +117,28 @@ async def load_data(file_path: str, samples=1, test=False) -> List[dict]:
 
 async def load_file_data(file_path: str, specific_indices: List[int] = None) -> List[dict]:
     data = []
-    # 异步读取文件内容
     async with aiofiles.open(file_path, mode="r", encoding='utf-8') as file:
         async for line in file:
             data.append(json.loads(line))
 
-    # 然后在随机选择的样本中基于特定索引列表进行进一步筛选
     if specific_indices is not None:
         filtered_data = [data[i] for i in specific_indices if i < len(data)]
         return filtered_data
 
     return data
 
 async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
-    """GSM8K evaluation main function"""
     data = await load_data(file_path, samples, test=test)
-    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=10)
+    results = await evaluate_all_problems(data, graph, path, max_concurrent_tasks=20)
     average_score, average_cost, total_cost = save_results_to_csv(results, path=path)
     print(f"Average score: {average_score:.5f}")
     print(f"Total Cost: {total_cost:.5f}")
     return average_score, total_cost
 
 async def optimize_gsm8k_evaluation(graph: Callable, file_path: str, path: str, va_list: list) -> Tuple[Any, Any, Any]:
-    """Optimize GSM8K evaluation main function"""
     data = await load_file_data(file_path, va_list)
-    results = await evaluate_all_problems(data, graph, path, max_concurrent_tasks=50)
+    results = await evaluate_all_problems(data, graph, path, max_concurrent_tasks=8)
     average_score, average_cost, total_cost = save_results_to_csv(results, path=path)
     print(f"Average score: {average_score:.5f}")
     print(f"Total Cost: {total_cost:.5f}")
-    return average_score, average_cost, total_cost
-
-# TODO Benchmark 与 Evaluator 中主要修改四个地方
-# 1. Evaluator.py 之中添加 val list
-# 2. load_data 函数修改
-# 3. result_to_csv 函数需要给 avg return
-# 4. evaluate_problem 中添加log.json
+    return average_score, average_cost, total_cost
@@ -11,7 +11,7 @@
 import re
 
 
-from examples.ags.benchmark.utils import generate_random_indices
+from examples.aflow.benchmark.utils import generate_random_indices
 
 global cost
 cost = 0
 
@@ -11,8 +11,8 @@
 import pandas as pd
 from tqdm.asyncio import tqdm_asyncio
 
-from examples.ags.benchmark.utils import generate_random_indices
-from examples.ags.benchmark.utils import log_mismatch
+from examples.aflow.benchmark.utils import generate_random_indices
+from examples.aflow.benchmark.utils import log_mismatch
 from metagpt.actions.code_sanitize import sanitize
 
 
@@ -134,7 +134,7 @@ async def evaluate_problem(data: dict, graph: Callable, path) -> Tuple[str, str,
 
     while retries < max_retries:
         try:
-            prediction = await graph(data["prompt"], data["entry_point"]) if graph else "None"
+            prediction = await asyncio.wait_for(graph(data["prompt"], data["entry_point"]), timeout=60) if graph else "None"
             cost = prediction[1]  
             solution = prediction[0]
             ret = check_solution(solution, data["test"], data["entry_point"])
@@ -145,6 +145,13 @@ async def evaluate_problem(data: dict, graph: Callable, path) -> Tuple[str, str,
             if score == 0:
                 log_mismatch(data["prompt"], expected_output, solution, score, path)
             break
+            
+        except TimeoutError:
+            solution = None
+            ret = (FAIL, ["超时"])
+            score = 0
+            cost = 0
+            break
 
         except Exception as e:
             retries += 1
@@ -195,7 +202,7 @@ def save_results_to_csv(results: List[Tuple[str, str, str, int]], path):
 
 async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
     data = await load_data(file_path, samples, test=test)
-    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=50)
+    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=5)
     average_score, average_cost, total_cost = save_results_to_csv(results, path=path)
     print(f"Average score on HumanEval dataset: {average_score:.5f}")
     print(f"Total Cost: {total_cost:.5f}")
@@ -205,7 +212,7 @@ async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, pa
 
 async def optimize_humaneval_evaluation(graph: Callable, file_path: str, path: str, va_list: List[int]) -> Tuple[float, float, float]:
     data = await load_file_data(file_path, va_list)
-    results = await evaluate_all_problems(data, graph, path, max_concurrent_tasks=25)
+    results = await evaluate_all_problems(data, graph, path, max_concurrent_tasks=10)
     average_score, average_cost, total_cost = save_results_to_csv(results, path=path)
     print(f"Average score on HumanEval dataset: {average_score:.5f}")
     print(f"Total Cost: {total_cost:.5f}")
 
@@ -12,7 +12,7 @@
 from typing import Optional, List, Tuple, Callable, Union
 from tqdm.asyncio import tqdm_asyncio
 
-from examples.ags.benchmark.utils import generate_random_indices
+from examples.aflow.benchmark.utils import generate_random_indices
 
 def extract_model_answer(text: str) -> str:
     # 提取最后一个 \boxed{...}
 
@@ -9,9 +9,9 @@
 from datetime import datetime
 
 from tqdm.asyncio import tqdm_asyncio
-from examples.ags.benchmark.utils import log_mismatch
+from examples.aflow.benchmark.utils import log_mismatch
 from metagpt.actions.code_sanitize import sanitize
-from examples.ags.benchmark.utils import generate_random_indices
+from examples.aflow.benchmark.utils import generate_random_indices
 
 PASS = "pass"
 FAIL = "fail"
@@ -32,13 +32,13 @@ async def load_data(file_path: str, samples=1, test=False) -> List[dict]:
 class TimeoutError(Exception):
     pass
 
-def run_with_timeout(func, args, timeout):
+def run_with_timeout(func, timeout):
     result = []
     stop_event = threading.Event()
 
     def target():
         try:
-            result.append(func(*args))
+            result.append(func())
         except Exception as e:
             result.append(e)
         finally:
@@ -61,6 +61,7 @@ def target():
 def check_solution(solution, test, entry_point):
 
     solution = sanitize(code=solution, entrypoint=entry_point)
+    print(test)
     try:
         # 定义一个包含所有必要模块的全局字典
         global_dict = {
@@ -87,7 +88,7 @@ def check_solution(solution, test, entry_point):
         check = global_dict["check"]
 
         # 运行检查函数，设置超时时间为120秒
-        result = run_with_timeout(check, (global_dict[entry_point],), 15)
+        result = run_with_timeout(check, 15)
 
         if result is None:
             result = (PASS, "解决方案通过了所有测试用例。")
@@ -110,16 +111,15 @@ async def evaluate_problem(data: dict, graph: Callable, path) -> Tuple[str, str,
     retries = 0
 
     expected_output = "\nCorrect Solution:\ndef " + data["code"]
-    
     while retries < max_retries:
         try:
             prediction = await graph(data["prompt"], data["entry_point"]) if graph else "None"
             cost = prediction[1]
             solution = prediction[0]
-            ret = await check_solution(solution, data["test"], data["entry_point"]) 
+            ret = check_solution(solution, data["test"], data["entry_point"]) 
             test_case_details = ret[1]
-            score = 1 if ret[0] == PASS else 0
-            expected_output = test_case_details + "\nCorrect Solution:" + data["code"]        
+            expected_output = test_case_details + "\nCorrect Solution:" + data["code"]    
+            score = 1 if ret[0] == PASS else 0    
 
             if score == 0:
                 log_mismatch(data["prompt"], expected_output, solution, score, path)
@@ -134,6 +134,7 @@ async def evaluate_problem(data: dict, graph: Callable, path) -> Tuple[str, str,
                 solution = None
                 ret = (FAIL, [])
                 score = 0
+                cost = 0
                 break
 
     return data["prompt"], solution, expected_output, score, cost