Skip to content

Commit 1491407

Browse files
author
alan.cl
committed
feat(benchmark): update benchmark task to use latest Falcon GitHub repo questions
1 parent 3c7cfba commit 1491407

File tree

9 files changed

+1372
-633
lines changed

9 files changed

+1372
-633
lines changed

packages/dbgpt-core/src/dbgpt/util/benchmarks/StorageUtil.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ class StorageUtil:
1010

1111
YUQUE_URL_PREFIX = "https://yuque.com"
1212

13+
GITHUB_FALCON_PREFIX = "https://github.com/eosphoros-ai/Falcon"
14+
1315
@staticmethod
1416
def get_file_parse_type(file_path: Optional[str]) -> FileParseTypeEnum:
1517
"""Get file parsing type based on file path
@@ -28,5 +30,7 @@ def get_file_parse_type(file_path: Optional[str]) -> FileParseTypeEnum:
2830

2931
if file_path.strip().startswith(StorageUtil.YUQUE_URL_PREFIX):
3032
return FileParseTypeEnum.YU_QUE
33+
if file_path.strip().startswith(StorageUtil.GITHUB_FALCON_PREFIX):
34+
return FileParseTypeEnum.GITHUB
3135

3236
return FileParseTypeEnum.EXCEL
Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,5 @@
11
from .benchmark_service import BenchmarkService
2-
from .data_compare_service import DataCompareService
3-
from .file_parse_service import FileParseService
4-
from .user_input_execute_service import UserInputExecuteService
52

63
__all__ = [
74
"BenchmarkService",
8-
"FileParseService",
9-
"UserInputExecuteService",
10-
"DataCompareService",
115
]

packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/benchmark_service.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from dbgpt.model.cluster import WorkerManagerFactory
1919
from dbgpt.storage.metadata import BaseDao
2020
from dbgpt.util import PaginationResult, get_or_create_event_loop
21+
from dbgpt.util.benchmarks import StorageUtil
2122
from dbgpt_serve.evaluate.service.benchmark.task.benchmark_agent_task import (
2223
BenchmarkAgentTask,
2324
)
@@ -40,7 +41,6 @@
4041
from ...models.models import ServeDao, ServeEntity
4142
from ..fetchdata.benchmark_data_manager import get_benchmark_manager
4243
from .data_compare_service import DataCompareService
43-
from .ext.excel_file_parse import ExcelFileParseService
4444
from .models import (
4545
BaseInputModel,
4646
BenchmarkDataSets,
@@ -49,7 +49,6 @@
4949
BenchmarkModeTypeEnum,
5050
BenchmarkTaskResult,
5151
ContentTypeEnum,
52-
FileParseTypeEnum,
5352
InputType,
5453
OutputType,
5554
)
@@ -61,10 +60,7 @@
6160

6261
BENCHMARK_SERVICE_COMPONENT_NAME = "dbgpt_serve_evaluate_benchmark_service"
6362

64-
STANDARD_BENCHMARK_FILE_PATH = os.path.join(
65-
BENCHMARK_DATA_ROOT_PATH,
66-
"2025_07_27_public_500_standard_benchmark_question_list.xlsx",
67-
)
63+
STANDARD_BENCHMARK_FILE_PATH = "https://github.com/eosphoros-ai/Falcon"
6864

6965
BENCHMARK_OUTPUT_RESULT_PATH = os.path.join(BENCHMARK_DATA_ROOT_PATH, "result")
7066

@@ -94,11 +90,10 @@ def __init__(
9490
super().__init__(system_app)
9591
self.rag_service = get_rag_service(system_app)
9692
self.prompt_service = get_prompt_service(system_app)
97-
self._file_parse_type = FileParseTypeEnum.EXCEL
93+
self._file_parse_type = StorageUtil.get_file_parse_type(STANDARD_BENCHMARK_FILE_PATH)
9894

99-
fps = ExcelFileParseService()
10095
dcs = DataCompareService()
101-
self.user_input_execute_service = UserInputExecuteService(fps, dcs)
96+
self.user_input_execute_service = UserInputExecuteService(dcs, self._file_parse_type)
10297

10398
self.trigger_executor = ThreadPoolExecutor(
10499
max_workers=5, thread_name_prefix="benchmark-fileWrite"
@@ -289,7 +284,7 @@ async def run_dataset_benchmark(
289284
await manager.load_data()
290285
logger.info(
291286
f"Benchmark dataset loaded from {manager._config.repo_url} "
292-
f"dir={manager._config.data_dir}"
287+
f"dir={manager._config.data_dirs}"
293288
)
294289
except Exception as e:
295290
logger.error(

packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/ext/__init__.py

Whitespace-only changes.
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
import json
2+
import logging
3+
from pathlib import Path
4+
from typing import Any, Dict, List, Optional
5+
6+
import pandas as pd
7+
from openpyxl import Workbook, load_workbook
8+
9+
from dbgpt.util.benchmarks.ExcelUtils import ExcelUtils
10+
11+
from ..file_parse_service import FileParseService
12+
from ..models import (
13+
AnswerExecuteModel,
14+
BaseInputModel,
15+
BenchmarkDataSets,
16+
DataCompareStrategyConfig,
17+
)
18+
19+
logger = logging.getLogger(__name__)
20+
21+
22+
class ExcelFileParseService(FileParseService):
    """Parse benchmark question and standard-answer sets from Excel workbooks.

    Input workbooks use a fixed column layout (see ``parse_input_sets``);
    the standard-answer sheet is keyed by Chinese column headers
    (see ``parse_standard_benchmark_sets``).
    """

    def parse_input_sets(self, path: str) -> BenchmarkDataSets:
        """
        Parse input sets from excel file

        Args:
            path: File location path

        Returns:
            BenchmarkDataSets: Parsed data sets. On a parse error the error
            is logged and whatever was parsed so far (possibly nothing) is
            returned — this is a deliberate best-effort contract.

        Raises:
            RuntimeError: If no input stream could be opened for ``path``.
        """
        input_stream = self.get_input_stream(path)

        if input_stream is None:
            raise RuntimeError(f"file not found! path: {path}")

        input_sets = BenchmarkDataSets()
        workbook = None

        try:
            # data_only=True yields cached formula results instead of formulas.
            workbook = load_workbook(input_stream, data_only=True)
            input_list = []

            # Data lives on the first worksheet only.
            sheet = workbook.worksheets[0]

            # openpyxl rows are 1-based; row 1 is the header, so start at 2.
            for row_num in range(2, sheet.max_row + 1):
                row = sheet[row_num]
                if ExcelUtils.is_row_empty(row):
                    continue

                # Fixed column layout (0-based): 0=serial no,
                # 1=analysis model id, 2=question, 3=self-defined tags,
                # 4=knowledge, 5=LLM output, 8=prompt.
                input_model = BaseInputModel(
                    serial_no=int(
                        ExcelUtils.get_cell_value_as_string(row[0]) or "0"
                    ),
                    analysis_model_id=ExcelUtils.get_cell_value_as_string(row[1]),
                    question=ExcelUtils.get_cell_value_as_string(row[2]),
                    self_define_tags=ExcelUtils.get_cell_value_as_string(row[3]),
                    llm_output=ExcelUtils.get_cell_value_as_string(row[5]),
                    knowledge=ExcelUtils.get_cell_value_as_string(row[4]),
                    prompt=ExcelUtils.get_cell_value_as_string(row[8]),
                )

                input_list.append(input_model)

            input_sets.data_list = input_list
        except Exception as e:
            # Best-effort: log and fall through with a partial/empty result.
            logger.error(f"parse excel error, path: {path}, errorMsg: {e}")
        finally:
            try:
                if workbook is not None:
                    workbook.close()
            except Exception as e:
                logger.error(f"close workbook error, path: {path}, errorMsg: {e}")

        return input_sets

    def parse_standard_benchmark_sets(
        self, standard_excel_path: str
    ) -> List[AnswerExecuteModel]:
        """
        Parse the standard (golden) benchmark answer sheet.

        Columns (by Chinese header): "编号" serial number (mandatory —
        rows without a parseable one are skipped), "用户问题" user question,
        "数据集ID" dataset id, "标准答案SQL" standard answer SQL,
        "是否排序" whether result ordering matters (defaults to True),
        "标准结果" optional expected results as one JSON object per line.

        Args:
            standard_excel_path: Path of the standard-answer Excel file.

        Returns:
            List[AnswerExecuteModel]: One model per valid row.
        """
        df = pd.read_excel(standard_excel_path, sheet_name=0)
        outputs: List[AnswerExecuteModel] = []
        for _, row in df.iterrows():
            # Serial number is mandatory; skip rows where it is missing/bad.
            try:
                serial_no = int(row["编号"])
            except Exception:
                continue

            question = row.get("用户问题")
            analysis_model_id = row.get("数据集ID")

            # Read each optional cell once instead of calling row.get twice.
            raw_llm_output = row.get("标准答案SQL")
            llm_output = None if pd.isna(raw_llm_output) else str(raw_llm_output)

            order_by = True
            raw_order_by = row.get("是否排序")
            if not pd.isna(raw_order_by):
                try:
                    order_by = bool(int(raw_order_by))
                except Exception:
                    # Unparseable flag → keep the default (ordering matters).
                    order_by = True

            std_result: Optional[List[Dict[str, List[str]]]] = None
            raw_std_result = row.get("标准结果")
            if not pd.isna(raw_std_result):
                std_result = self._parse_multi_standard_result(str(raw_std_result))

            strategy_config = DataCompareStrategyConfig(
                strategy="CONTAIN_MATCH",
                order_by=order_by,
                standard_result=std_result,
            )
            outputs.append(
                AnswerExecuteModel(
                    serialNo=serial_no,
                    analysisModelId=analysis_model_id,
                    question=question,
                    llmOutput=llm_output,
                    executeResult=std_result,
                    strategyConfig=strategy_config,
                )
            )
        return outputs

    def _parse_multi_standard_result(
        self, std_result_raw: str
    ) -> Optional[List[Dict[str, List[str]]]]:
        """
        Parse multiple standard results from raw string data.

        Handles multiple results separated by newlines and parses each line
        as a JSON object; lines that fail to parse are logged and skipped.

        Args:
            std_result_raw (str): Raw standard result string with multiple lines

        Returns:
            Optional[List[Dict[str, List[str]]]]: List of parsed dictionaries,
            or None if parsing fails or no valid data
        """
        try:
            std_result_raw = std_result_raw.strip()
            if not std_result_raw:
                return None

            # Multiple results are newline-separated, one JSON dict per line.
            result_list = []
            for line in std_result_raw.split("\n"):
                line = line.strip()
                if not line:
                    continue
                try:
                    result_list.append(json.loads(line))
                except Exception as e:
                    logger.warning(
                        f"Failed to parse line as JSON: {line}, error: {e}"
                    )

            return result_list if result_list else None
        except Exception as e:
            logger.error(f"parse multiple standard results error: {e}")
            return None

0 commit comments

Comments
 (0)