Skip to content

Commit 450571e

Browse files
XianBW, TPLin22, WinstonLiyt
authored
fix: add metric name check for valid scores (microsoft#724)
* update metric_name
* fix some bugs
* add an evaluation in workflow
* add an evaluation in runner
* fix ci
* test change
* fix CI
---------
Co-authored-by: TPLin22 <tplin2@163.com>
Co-authored-by: yuanteli <1957922024@qq.com>
1 parent 7fc9e18 commit 450571e

File tree

10 files changed

+45
-23
lines changed

10 files changed

+45
-23
lines changed

rdagent/components/coder/data_science/ensemble/eval.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ def evaluate(
3131
) -> EnsembleEvalFeedback:
3232

3333
target_task_information = target_task.get_task_information()
34+
metric_name = self.scen.metric_name
35+
3436
if (
3537
queried_knowledge is not None
3638
and target_task_information in queried_knowledge.success_task_to_knowledge_dict
@@ -55,7 +57,8 @@ def evaluate(
5557
.render(
5658
model_names=[
5759
fn[:-3] for fn in implementation.file_dict.keys() if fn.startswith("model_") and "test" not in fn
58-
]
60+
],
61+
metric_name=metric_name,
5962
)
6063
)
6164

@@ -73,6 +76,7 @@ def evaluate(
7376
system_prompt = T(".prompts:ensemble_eval.system").r(
7477
task_desc=target_task_information,
7578
test_code=test_code,
79+
metric_name=metric_name,
7680
code=implementation.file_dict["ensemble.py"],
7781
workflow_stdout=workflow_stdout,
7882
workflow_code=implementation.all_codes,

rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ assert model_set_in_scores == set({{model_names}}).union({"ensemble"}), (
123123
f"The scores dataframe does not contain the correct model names as index.\ncorrect model names are: {{model_names}} + ['ensemble']\nscore_df is:\n{score_df}"
124124
)
125125
assert score_df.index.is_unique, "The scores dataframe has duplicate model names."
126-
assert len(score_df.columns) == 1, f"The scores dataframe should have exactly one column for the scores of the evaluation indicator, but has these columns: {score_df.columns.tolist()}"
126+
assert score_df.columns.tolist() == ["{{metric_name}}"], f"The column names of the scores dataframe should be ['{{metric_name}}'], but is '{score_df.columns.tolist()}'"
127+
127128

128129
print("Ensemble test end.")

rdagent/components/coder/data_science/ensemble/prompts.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,13 @@ ensemble_eval:
9696
You should evaluate both the ensemble test results and the overall workflow results. **Approve the code only if both tests pass.**
9797
{% endif %}
9898
99+
The metric used for scoring the predictions:
100+
**{{ metric_name }}**
101+
99102
## Evaluation Criteria
100103
- You will be given the standard output (`stdout`) from the ensemble test and, if applicable, the workflow test.
101104
- Code should have no try-except blocks because they can hide errors.
105+
- Check whether the code implement the scoring process using the given metric.
102106
- The stdout includes the local variable values from the ensemble code execution. Check whether the validation score is calculated correctly.
103107
104108
Please respond with your feedback in the following JSON format and order

rdagent/components/coder/data_science/ensemble/test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def develop_one_competition(competition: str):
4444
""",
4545
)
4646

47-
exp = EnsembleExperiment(sub_tasks=[task])
47+
exp = EnsembleExperiment(pending_tasks_list=[task])
4848

4949
# Injecting the corresponding specification
5050
exp.experiment_workspace.inject_files(**{"spec/ensemble.md": ensemble_spec})

rdagent/components/coder/data_science/workflow/eval.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ def evaluate(
6868
implementation.execute(env=env, entry=f"rm submission.csv scores.csv")
6969

7070
stdout = implementation.execute(env=env, entry=f"python main.py")
71+
72+
# remove EDA part
7173
stdout = re.sub(r"=== Start of EDA part ===(.*)=== End of EDA part ===", "", stdout)
7274

7375
# Check score file
@@ -85,9 +87,17 @@ def evaluate(
8587
model_set_in_folder = set(
8688
f[:-3] for f in implementation.file_dict.keys() if re.match(r"^model_(?!test)\w+\.py$", f)
8789
)
90+
91+
# Check model names (index)
8892
if model_set_in_scores != model_set_in_folder.union({"ensemble"}):
8993
score_check_text += f"\n[Error] The scores dataframe does not contain the correct model names as index.\ncorrect model names are: {model_set_in_folder.union({'ensemble'})}\nscore_df is:\n{score_df}"
9094
score_ret_code = 1
95+
96+
# Check metric name (columns)
97+
if score_df.columns.tolist() != [self.scen.metric_name]:
98+
score_check_text += f"\n[Error] The scores dataframe does not contain the correct column names.\nCorrect columns is: ['{self.scen.metric_name}']\nBut got: {score_df.columns.tolist()}"
99+
score_ret_code = 1
100+
91101
except Exception as e:
92102
score_check_text += f"\n[Error] in checking the scores.csv file: {e}\nscores.csv's content:\n-----\n{score_fp.read_text()}\n-----"
93103
score_ret_code = 1
@@ -101,17 +111,6 @@ def evaluate(
101111
)
102112
stdout += "\n" + submission_check_out
103113

104-
# MLEBench Check
105-
# !!! Since we are running on a sampled dataset, mlebench check is not required.
106-
# mle_check_code = (
107-
# (DIRNAME / "eval_tests" / "mle_submission_format_test.txt")
108-
# .read_text()
109-
# .replace("<competition_id>", self.scen.competition)
110-
# )
111-
# implementation.inject_files(**{"test/mle_submission_format_test.py": mle_check_code})
112-
# stdout += "----Submission Check 2-----\n"
113-
# stdout += implementation.execute(env=mde, entry=f"python test/mle_submission_format_test.py")
114-
115114
system_prompt = T(".prompts:workflow_eval.system").r(
116115
scenario=self.scen.get_scenario_all_desc(),
117116
task_desc=target_task.get_task_information(),

rdagent/scenarios/data_science/dev/runner/eval.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,17 @@ def evaluate(
6565
model_set_in_folder = set(
6666
f[:-3] for f in implementation.file_dict.keys() if re.match(r"^model_(?!test)\w+\.py$", f)
6767
)
68+
69+
# Check model names (index)
6870
if model_set_in_scores != model_set_in_folder.union({"ensemble"}):
6971
score_check_text += f"\n[Error] The scores dataframe does not contain the correct model names as index.\ncorrect model names are: {model_set_in_folder.union({'ensemble'})}\nscore_df is:\n{score_df}"
7072
score_ret_code = 1
73+
74+
# Check metric name (columns)
75+
if score_df.columns.tolist() != [self.scen.metric_name]:
76+
score_check_text += f"\n[Error] The scores dataframe does not contain the correct column names.\nCorrect columns is: ['{self.scen.metric_name}']\nBut got: {score_df.columns.tolist()}"
77+
score_ret_code = 1
78+
7179
except Exception as e:
7280
logger.error(f"Error in checking the scores.csv file: {e}")
7381
score_check_text += f"\n[Error] in checking the scores.csv file: {e}\nscores.csv's content:\n-----\n{score_fp.read_text()}\n-----"

rdagent/scenarios/data_science/scen/__init__.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -267,21 +267,25 @@ def _analysis_competition_description(self):
267267
self.data_type = response_json_analysis.get("Data Type", "No data type provided")
268268
self.brief_description = response_json_analysis.get("Brief Description", "No brief description provided")
269269
self.dataset_description = response_json_analysis.get("Dataset Description", "No dataset description provided")
270-
self.target_description = response_json_analysis.get("Evaluation Description", "No target description provided")
271270
self.submission_specifications = response_json_analysis.get(
272271
"Submission Specifications", "No submission requirements provided"
273272
)
274273
self.model_output_channel = response_json_analysis.get("Submission channel number to each sample", 1)
274+
self.metric_description = response_json_analysis.get(
275+
"Metric Evaluation Description", "No target description provided"
276+
)
277+
self.metric_name = response_json_analysis.get("Metric Name", "custom_metric")
275278
self.metric_direction_guess = response_json_analysis.get("Metric Direction", True)
276279

277280
def get_competition_full_desc(self) -> str:
278281
return f"""Task Type: {self.task_type}
279282
Data Type: {self.data_type}
280283
Brief Description: {self.brief_description}
281284
Dataset Description: {self.dataset_description}
282-
Target Description: {self.target_description}
283285
Submission Specifications: {self.submission_specifications}
284286
Model Output Channel: {self.model_output_channel}
287+
Metric Evaluation Description: {self.metric_description}
288+
Metric Name: {self.metric_name}
285289
"""
286290

287291
@property
@@ -292,7 +296,7 @@ def background(self) -> str:
292296
data_type=self.data_type,
293297
brief_description=self.brief_description,
294298
dataset_description=self.dataset_description,
295-
target_description=self.target_description,
299+
metric_description=self.metric_description,
296300
)
297301
return background_prompt
298302

@@ -307,7 +311,8 @@ def get_scenario_all_desc(self) -> str:
307311
return T(".prompts:scenario_description").r(
308312
background=self.background,
309313
submission_specifications=self.submission_specifications,
310-
evaluation=self.target_description,
314+
evaluation=self.metric_description,
315+
metric_name=self.metric_name,
311316
metric_direction=self.metric_direction,
312317
eda_output=self.eda_output,
313318
)

rdagent/scenarios/data_science/scen/prompts.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,10 @@ competition_description_template:
3333
"Data Type": "The type of competition data, e.g., 'Tabular', 'Time Series', 'Text (Natural Language Processing)', 'Image (Computer Vision)', 'Audio', 'Video'",
3434
"Brief Description": "A brief description of the competition",
3535
"Dataset Description": "The dataset utilized in the competition is described based on two sources: the Competition Description, which provides contextual details about the original files, and the Processed Data folder description, which outlines the structure of the dataset after processing. While there may be differences—for instance, original files mentioned in the Competition Description (e.g., .zip files) may have been extracted or restructured—your task is to interpret the new file structure accurately (do not contain any file or folder that is not in Processed Data folder description) and reconcile it with the contextual information from the Competition Description to provide a clear and updated explanation.",
36-
"Evaluation Description": "A description of the evaluation used in the competition.",
3736
"Submission Specifications": "The submission specification & sample submission file descriptions for the model to output."
3837
"Submission channel number to each sample": "The number of channels in the output for each sample, e.g., 1 for regression, N for N class classification with probabilities, etc. A Integer. If not specified, it is 1."
38+
"Metric Evaluation Description": "A precise explanation of how the submissions are scored in this competition, including how the metric is calculated and any specific considerations.",
39+
"Metric Name": "The name of the metric which this competition use for scoring the submission."
3940
"Metric direction": True or False as True means bigger metric number is better, False means smaller is better.
4041
}
4142
user: |-
@@ -57,7 +58,7 @@ competition_background: |-
5758
The data type used in this competition is {{ data_type }}.
5859
Briefly, the competition involves: {{ brief_description }}.
5960
The dataset used in this competition is: {{ dataset_description }}.
60-
Your goal in this competition is to: {{ target_description }}.
61+
The evaluation metric of this competition is: {{ metric_description }}.
6162
6263
rich_style_description: |-
6364
### {{ name }} Agent: Automated Feature Engineering & Model Tuning Evolution

rdagent/scenarios/kaggle/experiment/prompts.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ kg_description_template:
1111
"Competition Features": "Two-line description of the overall features involved within the competition as background."
1212
"Submission Specifications": "The submission specification & sample submission csv descriptions for the model to output."
1313
"Submission channel number to each sample": "The number of channels in the output for each sample, e.g., 1 for regression, N for N class classification with probabilities, etc. A Integer. If not specified, it is 1."
14-
"Evaluation Description": "A brief description of the metrics used in the evaluation. Please note that if `evaluation_metric_direction` is True, it indicates that higher values are better; if False, lower values are preferred."
14+
"Metric Evaluation Description": "A brief description of the metrics used in the evaluation. Please note that if `evaluation_metric_direction` is True, it indicates that higher values are better; if False, lower values are preferred."
1515
}
1616
Since these might be very similar column names in data like one_hot_encoded columns, you can use some regex to group them together.
1717

rdagent/scenarios/kaggle/experiment/scenario.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def _analysis_competition_description(self):
107107
)
108108
self.model_output_channel = response_json_analysis.get("Submission channel number to each sample", 1)
109109
self.evaluation_desc = response_json_analysis.get(
110-
"Evaluation Description", "No evaluation specification provided."
110+
"Metric Evaluation Description", "No evaluation specification provided."
111111
)
112112

113113
def get_competition_full_desc(self) -> str:
@@ -118,7 +118,7 @@ def get_competition_full_desc(self) -> str:
118118
Competition Features: {self.competition_features}
119119
Submission Specifications: {self.submission_specifications}
120120
Model Output Channel: {self.model_output_channel}
121-
Evaluation Descriptions: {self.evaluation_desc}
121+
Metric Evaluation Description: {self.evaluation_desc}
122122
Is the evaluation metric the higher the better: {evaluation_direction}
123123
"""
124124

0 commit comments

Comments
 (0)