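Fix metric handling bugs: rename "Evaluation Description" to "Metric Evaluation Description" and assert the score column name in the ensemble test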

microsoft · XianBW · Mar 27, 2025 · Mar 27, 2025 · Mar 27, 2025 · Mar 27, 2025
commit df3d4ac03cbcfd338e5488f904a604d166bfe800
diff --git a/rdagent/components/coder/data_science/ensemble/eval.py b/rdagent/components/coder/data_science/ensemble/eval.py
@@ -32,6 +32,7 @@ def evaluate(
 
         target_task_information = target_task.get_task_information()
         metric_name = self.scen.metric_name
+
         if (
             queried_knowledge is not None
             and target_task_information in queried_knowledge.success_task_to_knowledge_dict
@@ -56,7 +57,8 @@ def evaluate(
             .render(
                 model_names=[
                     fn[:-3] for fn in implementation.file_dict.keys() if fn.startswith("model_") and "test" not in fn
-                ]
+                ],
+                metric_name=metric_name,
             )
         )
 

diff --git a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt
@@ -122,5 +122,7 @@ assert model_set_in_scores == set({{model_names}}).union({"ensemble"}), (
 )
 assert score_df.index.is_unique, "The scores dataframe has duplicate model names."
 assert len(score_df.columns) == 1, f"The scores dataframe should have exactly one column for the scores of the evaluation indicator, but has these columns: {score_df.columns.tolist()}"
+# The metric name is substituted as raw text when this template is rendered, so it is wrapped in quotes to form a Python string literal.
+assert score_df.columns[0] == "{{ metric_name }}", f"The column name of the scores in the scores dataframe should be '{{ metric_name }}', but is '{score_df.columns[0]}'"
 
 print("Ensemble test end.")
diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py
@@ -267,11 +267,11 @@ def _analysis_competition_description(self):
         self.data_type = response_json_analysis.get("Data Type", "No data type provided")
         self.brief_description = response_json_analysis.get("Brief Description", "No brief description provided")
         self.dataset_description = response_json_analysis.get("Dataset Description", "No dataset description provided")
-        self.target_description = response_json_analysis.get("Evaluation Description", "No target description provided")
         self.submission_specifications = response_json_analysis.get(
             "Submission Specifications", "No submission requirements provided"
         )
         self.model_output_channel = response_json_analysis.get("Submission channel number to each sample", 1)
+        self.metric_description = response_json_analysis.get("Metric Evaluation Description", "No target description provided")
         self.metric_name = response_json_analysis.get("Metric Name", "No metric name provided")
         self.metric_direction_guess = response_json_analysis.get("Metric Direction", True)
 
@@ -280,9 +280,9 @@ def get_competition_full_desc(self) -> str:
     Data Type: {self.data_type}
     Brief Description: {self.brief_description}
     Dataset Description: {self.dataset_description}
-    Target Description: {self.target_description}
     Submission Specifications: {self.submission_specifications}
     Model Output Channel: {self.model_output_channel}
+    Metric Evaluation Description: {self.metric_description}
     Metric Name: {self.metric_name}
     """
 
@@ -294,7 +294,7 @@ def background(self) -> str:
             data_type=self.data_type,
             brief_description=self.brief_description,
             dataset_description=self.dataset_description,
-            target_description=self.target_description,
+            metric_description=self.metric_description,
         )
         return background_prompt
 
@@ -309,7 +309,7 @@ def get_scenario_all_desc(self) -> str:
         return T(".prompts:scenario_description").r(
             background=self.background,
             submission_specifications=self.submission_specifications,
-            evaluation=self.target_description,
+            evaluation=self.metric_description,
             metric_name=self.metric_name,
             metric_direction=self.metric_direction,
             eda_output=self.eda_output,

diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml
@@ -33,9 +33,9 @@ competition_description_template:
       "Data Type": "The type of competition data, e.g., 'Tabular', 'Time Series', 'Text (Natural Language Processing)', 'Image (Computer Vision)', 'Audio', 'Video'", 
       "Brief Description": "A brief description of the competition",
       "Dataset Description": "The dataset utilized in the competition is described based on two sources: the Competition Description, which provides contextual details about the original files, and the Processed Data folder description, which outlines the structure of the dataset after processing. While there may be differences—for instance, original files mentioned in the Competition Description (e.g., .zip files) may have been extracted or restructured—your task is to interpret the new file structure accurately (do not contain any file or folder that is not in Processed Data folder description) and reconcile it with the contextual information from the Competition Description to provide a clear and updated explanation.",
-      "Evaluation Description": "A description of the evaluation used in the competition.",
       "Submission Specifications": "The submission specification & sample submission file descriptions for the model to output."
       "Submission channel number to each sample": "The number of channels in the output for each sample, e.g., 1 for regression, N for N class classification with probabilities, etc. A Integer. If not specified, it is 1."
+      "Metric Evaluation Description": "A precise explanation of how the submissions are scored in this competition, including how the metric is calculated and any specific considerations.",
       "Metric Name": "The name of the metric which this competition use for scoring the submission."
       "Metric direction": True or False as True means bigger metric number is better, False means smaller is better.
     }
@@ -58,7 +58,7 @@ competition_background: |-
   The data type used in this competition is {{ data_type }}.
   Briefly, the competition involves: {{ brief_description }}.
   The dataset used in this competition is: {{ dataset_description }}.
-  Your goal in this competition is to: {{ target_description }}.
+  The evaluation metric of this competition is: {{ metric_description }}.
 
 rich_style_description: |-
   ### {{ name }} Agent: Automated Feature Engineering & Model Tuning Evolution

diff --git a/rdagent/scenarios/kaggle/experiment/prompts.yaml b/rdagent/scenarios/kaggle/experiment/prompts.yaml
@@ -11,7 +11,7 @@ kg_description_template:
       "Competition Features": "Two-line description of the overall features involved within the competition as background."
       "Submission Specifications": "The submission specification & sample submission csv descriptions for the model to output."
       "Submission channel number to each sample": "The number of channels in the output for each sample, e.g., 1 for regression, N for N class classification with probabilities, etc. A Integer. If not specified, it is 1."
-      "Evaluation Description": "A brief description of the metrics used in the evaluation. Please note that if `evaluation_metric_direction` is True, it indicates that higher values are better; if False, lower values are preferred."
+      "Metric Evaluation Description": "A brief description of the metrics used in the evaluation. Please note that if `evaluation_metric_direction` is True, it indicates that higher values are better; if False, lower values are preferred."
     }
     Since these might be very similar column names in data like one_hot_encoded columns, you can use some regex to group them together.
 

diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py
@@ -107,7 +107,7 @@ def _analysis_competition_description(self):
         )
         self.model_output_channel = response_json_analysis.get("Submission channel number to each sample", 1)
         self.evaluation_desc = response_json_analysis.get(
-            "Evaluation Description", "No evaluation specification provided."
+            "Metric Evaluation Description", "No evaluation specification provided."
         )
 
     def get_competition_full_desc(self) -> str:
@@ -118,7 +118,7 @@ def get_competition_full_desc(self) -> str:
     Competition Features: {self.competition_features}
     Submission Specifications: {self.submission_specifications}
     Model Output Channel: {self.model_output_channel}
-    Evaluation Descriptions: {self.evaluation_desc}
+    Metric Evaluation Description: {self.evaluation_desc}
     Is the evaluation metric the higher the better: {evaluation_direction}
     """