Skip to content

Commit 450571e

Browse files
XianBW, TPLin22, WinstonLiyt
authored
fix: add metric name check for valid scores (microsoft#724)
* update metric_name
* fix some bugs
* add an evaluation in workflow
* add an evaluation in runner
* fix ci
* test change
* fix CI
---------
Co-authored-by: TPLin22 <tplin2@163.com>
Co-authored-by: yuanteli <1957922024@qq.com>
1 parent 7fc9e18 commit 450571e

File tree

10 files changed

+45
-23
lines changed

10 files changed

+45
-23
lines changed

rdagent/components/coder/data_science/ensemble/eval.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ def evaluate(
3131
) -> EnsembleEvalFeedback:
3232

3333
target_task_information = target_task.get_task_information()
34+
metric_name = self.scen.metric_name
35+
3436
if (
3537
queried_knowledge is not None
3638
and target_task_information in queried_knowledge.success_task_to_knowledge_dict
@@ -55,7 +57,8 @@ def evaluate(
5557
.render(
5658
model_names=[
5759
fn[:-3] for fn in implementation.file_dict.keys() if fn.startswith("model_") and "test" not in fn
58-
]
60+
],
61+
metric_name=metric_name,
5962
)
6063
)
6164

@@ -73,6 +76,7 @@ def evaluate(
7376
system_prompt = T(".prompts:ensemble_eval.system").r(
7477
task_desc=target_task_information,
7578
test_code=test_code,
79+
metric_name=metric_name,
7680
code=implementation.file_dict["ensemble.py"],
7781
workflow_stdout=workflow_stdout,
7882
workflow_code=implementation.all_codes,

rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ assert model_set_in_scores == set({{model_names}}).union({"ensemble"}), (
123123
f"The scores dataframe does not contain the correct model names as index.\ncorrect model names are: {{model_names}} + ['ensemble']\nscore_df is:\n{score_df}"
124124
)
125125
assert score_df.index.is_unique, "The scores dataframe has duplicate model names."
126-
assert len(score_df.columns) == 1, f"The scores dataframe should have exactly one column for the scores of the evaluation indicator, but has these columns: {score_df.columns.tolist()}"
126+
assert score_df.columns.tolist() == ["{{metric_name}}"], f"The column names of the scores dataframe should be ['{{metric_name}}'], but is '{score_df.columns.tolist()}'"
127+
127128

128129
print("Ensemble test end.")

rdagent/components/coder/data_science/ensemble/prompts.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,13 @@ ensemble_eval:
9696
You should evaluate both the ensemble test results and the overall workflow results. **Approve the code only if both tests pass.**
9797
{% endif %}
9898
99+
The metric used for scoring the predictions:
100+
**{{ metric_name }}**
101+
99102
## Evaluation Criteria
100103
- You will be given the standard output (`stdout`) from the ensemble test and, if applicable, the workflow test.
101104
- Code should have no try-except blocks because they can hide errors.
105+
- Check whether the code implement the scoring process using the given metric.
102106
- The stdout includes the local variable values from the ensemble code execution. Check whether the validation score is calculated correctly.
103107
104108
Please respond with your feedback in the following JSON format and order

rdagent/components/coder/data_science/ensemble/test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def develop_one_competition(competition: str):
4444
""",
4545
)
4646

47-
exp = EnsembleExperiment(sub_tasks=[task])
47+
exp = EnsembleExperiment(pending_tasks_list=[task])
4848

4949
# Injecting the corresponding specification
5050
exp.experiment_workspace.inject_files(**{"spec/ensemble.md": ensemble_spec})

rdagent/components/coder/data_science/workflow/eval.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ def evaluate(
6868
implementation.execute(env=env, entry=f"rm submission.csv scores.csv")
6969

7070
stdout = implementation.execute(env=env, entry=f"python main.py")
71+
72+
# remove EDA part
7173
stdout = re.sub(r"=== Start of EDA part ===(.*)=== End of EDA part ===", "", stdout)
7274

7375
# Check score file
@@ -85,9 +87,17 @@ def evaluate(
8587
model_set_in_folder = set(
8688
f[:-3] for f in implementation.file_dict.keys() if re.match(r"^model_(?!test)\w+\.py$", f)
8789
)
90+
91+
# Check model names (index)
8892
if model_set_in_scores != model_set_in_folder.union({"ensemble"}):
8993
score_check_text += f"\n[Error] The scores dataframe does not contain the correct model names as index.\ncorrect model names are: {model_set_in_folder.union({'ensemble'})}\nscore_df is:\n{score_df}"
9094
score_ret_code = 1
95+
96+
# Check metric name (columns)
97+
if score_df.columns.tolist() != [self.scen.metric_name]:
98+
score_check_text += f"\n[Error] The scores dataframe does not contain the correct column names.\nCorrect columns is: ['{self.scen.metric_name}']\nBut got: {score_df.columns.tolist()}"
99+
score_ret_code = 1
100+
91101
except Exception as e:
92102
score_check_text += f"\n[Error] in checking the scores.csv file: {e}\nscores.csv's content:\n-----\n{score_fp.read_text()}\n-----"
93103
score_ret_code = 1
@@ -101,17 +111,6 @@ def evaluate(
101111
)
102112
stdout += "\n" + submission_check_out
103113

104-
# MLEBench Check
105-
# !!! Since we are running on a sampled dataset, mlebench check is not required.
106-
# mle_check_code = (
107-
# (DIRNAME / "eval_tests" / "mle_submission_format_test.txt")
108-
# .read_text()
109-
# .replace("<competition_id>", self.scen.competition)
110-
# )
111-
# implementation.inject_files(**{"test/mle_submission_format_test.py": mle_check_code})
112-
# stdout += "----Submission Check 2-----\n"
113-
# stdout += implementation.execute(env=mde, entry=f"python test/mle_submission_format_test.py")
114-
115114
system_prompt = T(".prompts:workflow_eval.system").r(
116115
scenario=self.scen.get_scenario_all_desc(),
117116
task_desc=target_task.get_task_information(),

rdagent/scenarios/data_science/dev/runner/eval.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,17 @@ def evaluate(
6565
model_set_in_folder = set(
6666
f[:-3] for f in implementation.file_dict.keys() if re.match(r"^model_(?!test)\w+\.py$", f)
6767
)
68+
69+
# Check model names (index)
6870
if model_set_in_scores != model_set_in_folder.union({"ensemble"}):
6971
score_check_text += f"\n[Error] The scores dataframe does not contain the correct model names as index.\ncorrect model names are: {model_set_in_folder.union({'ensemble'})}\nscore_df is:\n{score_df}"
7072
score_ret_code = 1
73+
74+
# Check metric name (columns)
75+
if score_df.columns.tolist() != [self.scen.metric_name]:
76+
score_check_text += f"\n[Error] The scores dataframe does not contain the correct column names.\nCorrect columns is: ['{self.scen.metric_name}']\nBut got: {score_df.columns.tolist()}"
77+
score_ret_code = 1
78+
7179
except Exception as e:
7280
logger.error(f"Error in checking the scores.csv file: {e}")
7381
score_check_text += f"\n[Error] in checking the scores.csv file: {e}\nscores.csv's content:\n-----\n{score_fp.read_text()}\n-----"

rdagent/scenarios/data_science/scen/__init__.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -267,21 +267,25 @@ def _analysis_competition_description(self):
267267
self.data_type = response_json_analysis.get("Data Type", "No data type provided")
268268
self.brief_description = response_json_analysis.get("Brief Description", "No brief description provided")
269269
self.dataset_description = response_json_analysis.get("Dataset Description", "No dataset description provided")
270-
self.target_description = response_json_analysis.get("Evaluation Description", "No target description provided")
271270
self.submission_specifications = response_json_analysis.get(
272271
"Submission Specifications", "No submission requirements provided"
273272
)
274273
self.model_output_channel = response_json_analysis.get("Submission channel number to each sample", 1)
274+
self.metric_description = response_json_analysis.get(
275+
"Metric Evaluation Description", "No target description provided"
276+
)
277+
self.metric_name = response_json_analysis.get("Metric Name", "custom_metric")
275278
self.metric_direction_guess = response_json_analysis.get("Metric Direction", True)
276279

277280
def get_competition_full_desc(self) -> str:
278281
return f"""Task Type: {self.task_type}
279282
Data Type: {self.data_type}
280283
Brief Description: {self.brief_description}
281284
Dataset Description: {self.dataset_description}
282-
Target Description: {self.target_description}
283285
Submission Specifications: {self.submission_specifications}
284286
Model Output Channel: {self.model_output_channel}
287+
Metric Evaluation Description: {self.metric_description}
288+
Metric Name: {self.metric_name}
285289
"""
286290

287291
@property
@@ -292,7 +296,7 @@ def background(self) -> str:
292296
data_type=self.data_type,
293297
brief_description=self.brief_description,
294298
dataset_description=self.dataset_description,
295-
target_description=self.target_description,
299+
metric_description=self.metric_description,
296300
)
297301
return background_prompt
298302

@@ -307,7 +311,8 @@ def get_scenario_all_desc(self) -> str:
307311
return T(".prompts:scenario_description").r(
308312
background=self.background,
309313
submission_specifications=self.submission_specifications,
310-
evaluation=self.target_description,
314+
evaluation=self.metric_description,
315+
metric_name=self.metric_name,
311316
metric_direction=self.metric_direction,
312317
eda_output=self.eda_output,
313318
)

rdagent/scenarios/data_science/scen/prompts.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,10 @@ competition_description_template:
3333
"Data Type": "The type of competition data, e.g., 'Tabular', 'Time Series', 'Text (Natural Language Processing)', 'Image (Computer Vision)', 'Audio', 'Video'",
3434
"Brief Description": "A brief description of the competition",
3535
"Dataset Description": "The dataset utilized in the competition is described based on two sources: the Competition Description, which provides contextual details about the original files, and the Processed Data folder description, which outlines the structure of the dataset after processing. While there may be differences—for instance, original files mentioned in the Competition Description (e.g., .zip files) may have been extracted or restructured—your task is to interpret the new file structure accurately (do not contain any file or folder that is not in Processed Data folder description) and reconcile it with the contextual information from the Competition Description to provide a clear and updated explanation.",
36-
"Evaluation Description": "A description of the evaluation used in the competition.",
3736
"Submission Specifications": "The submission specification & sample submission file descriptions for the model to output."
3837
"Submission channel number to each sample": "The number of channels in the output for each sample, e.g., 1 for regression, N for N class classification with probabilities, etc. A Integer. If not specified, it is 1."
38+
"Metric Evaluation Description": "A precise explanation of how the submissions are scored in this competition, including how the metric is calculated and any specific considerations.",
39+
"Metric Name": "The name of the metric which this competition use for scoring the submission."
3940
"Metric direction": True or False as True means bigger metric number is better, False means smaller is better.
4041
}
4142
user: |-
@@ -57,7 +58,7 @@ competition_background: |-
5758
The data type used in this competition is {{ data_type }}.
5859
Briefly, the competition involves: {{ brief_description }}.
5960
The dataset used in this competition is: {{ dataset_description }}.
60-
Your goal in this competition is to: {{ target_description }}.
61+
The evaluation metric of this competition is: {{ metric_description }}.
6162
6263
rich_style_description: |-
6364
### {{ name }} Agent: Automated Feature Engineering & Model Tuning Evolution

rdagent/scenarios/kaggle/experiment/prompts.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ kg_description_template:
1111
"Competition Features": "Two-line description of the overall features involved within the competition as background."
1212
"Submission Specifications": "The submission specification & sample submission csv descriptions for the model to output."
1313
"Submission channel number to each sample": "The number of channels in the output for each sample, e.g., 1 for regression, N for N class classification with probabilities, etc. A Integer. If not specified, it is 1."
14-
"Evaluation Description": "A brief description of the metrics used in the evaluation. Please note that if `evaluation_metric_direction` is True, it indicates that higher values are better; if False, lower values are preferred."
14+
"Metric Evaluation Description": "A brief description of the metrics used in the evaluation. Please note that if `evaluation_metric_direction` is True, it indicates that higher values are better; if False, lower values are preferred."
1515
}
1616
Since these might be very similar column names in data like one_hot_encoded columns, you can use some regex to group them together.
1717

rdagent/scenarios/kaggle/experiment/scenario.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def _analysis_competition_description(self):
107107
)
108108
self.model_output_channel = response_json_analysis.get("Submission channel number to each sample", 1)
109109
self.evaluation_desc = response_json_analysis.get(
110-
"Evaluation Description", "No evaluation specification provided."
110+
"Metric Evaluation Description", "No evaluation specification provided."
111111
)
112112

113113
def get_competition_full_desc(self) -> str:
@@ -118,7 +118,7 @@ def get_competition_full_desc(self) -> str:
118118
Competition Features: {self.competition_features}
119119
Submission Specifications: {self.submission_specifications}
120120
Model Output Channel: {self.model_output_channel}
121-
Evaluation Descriptions: {self.evaluation_desc}
121+
Metric Evaluation Description: {self.evaluation_desc}
122122
Is the evaluation metric the higher the better: {evaluation_direction}
123123
"""
124124

0 commit comments

Comments
 (0)