feat: trace merging (microsoft#836)

you-n-g · web-flow · commit a3d547369e40 · 2025-04-29T09:30:45.000+08:00
* feat: runnable -- add exp_gen_cls param, get_leaves and merge exp gen functionalities

* fix: remove unused scenario_desc and update YAML task labels

* feat: override selection and update merge task description

* lint

* lint

* lint

* lint

* lint

* fix: log competition setting to enable mle_summary

* fix name error
diff --git a/.gitignore b/.gitignore
@@ -112,6 +112,7 @@ celerybeat.pid
 
 # Environments
 .env*
+*.env
 .venv
 ^env/
 venv/
diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py
@@ -166,6 +166,7 @@ def record(self, prev_out: dict[str, Any]):
                     self.trace = DSTrace(scen=self.trace.scen, knowledge_base=self.trace.knowledge_base)
         logger.log_object(self.trace, tag="trace")
         logger.log_object(self.trace.sota_experiment(), tag="SOTA experiment")
+
         if DS_RD_SETTING.enable_knowledge_base and DS_RD_SETTING.knowledge_base_version == "v1":
             logger.log_object(self.trace.knowledge_base, tag="knowledge_base")
             self.trace.knowledge_base.dump()
@@ -228,6 +229,7 @@ def load(
         replace_timer: bool = True,
     ) -> "LoopBase":
         session = super().load(path, output_path, do_truncate, replace_timer)
+        logger.log_object(DS_RD_SETTING.competition, tag="competition")  # NOTE: necessary to make mle_summary work.
         if DS_RD_SETTING.enable_knowledge_base and DS_RD_SETTING.knowledge_base_version == "v1":
             session.trace.knowledge_base = DSKnowledgeBase(
                 path=DS_RD_SETTING.knowledge_base_path, idea_pool_json_path=DS_RD_SETTING.idea_pool_json_path
@@ -257,6 +259,7 @@ def main(
     do_truncate=True,
     timeout=None,
     replace_timer=True,
+    exp_gen_cls: str | None = None,
 ):
     """
 
@@ -275,6 +278,10 @@ def main(
     competition :
     do_truncate :
         If set to True, the logger will truncate the future log messages by calling `logger.storage.truncate`.
+    replace_timer :
+        If session is loaded, should we replace the timer with session.timer
+    exp_gen_cls :
+        When we have different stages, we can replace the exp_gen with the new proposal
 
 
     Auto R&D Evolving loop for models in a Kaggle scenario.
@@ -300,6 +307,11 @@ def main(
         kaggle_loop = DataScienceRDLoop(DS_RD_SETTING)
     else:
         kaggle_loop = DataScienceRDLoop.load(path, output_path, do_truncate, replace_timer)
+
+    # replace exp_gen if we have new class
+    if exp_gen_cls is not None:
+        kaggle_loop.exp_gen = import_class(exp_gen_cls)(kaggle_loop.exp_gen.scen)
+
     kaggle_loop.run(step_n=step_n, loop_n=loop_n, all_duration=timeout)
 
 
diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py
@@ -5,7 +5,7 @@
 
 from rdagent.core.experiment import Experiment, FBWorkspace, Task
 
-COMPONENT = Literal["DataLoadSpec", "FeatureEng", "Model", "Ensemble", "Workflow"]
+COMPONENT = Literal["DataLoadSpec", "FeatureEng", "Model", "Ensemble", "Workflow", "Pipeline"]
 
 
 class DSExperiment(Experiment[Task, FBWorkspace, FBWorkspace]):
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/__init__.py b/rdagent/scenarios/data_science/proposal/exp_gen/__init__.py
@@ -2,7 +2,7 @@
 from rdagent.core.proposal import ExpGen
 from rdagent.core.utils import import_class
 from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
-from rdagent.scenarios.data_science.proposal.exp_gen.base import DSTrace
+from rdagent.scenarios.data_science.proposal.exp_gen.base import DSHypothesis, DSTrace
 from rdagent.scenarios.data_science.proposal.exp_gen.draft import DSDraftExpGen
 from rdagent.scenarios.data_science.proposal.exp_gen.proposal import (
     DSProposalV1ExpGen,
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/base.py b/rdagent/scenarios/data_science/proposal/exp_gen/base.py
@@ -68,6 +68,21 @@ def get_current_selection(self) -> tuple[int, ...]:
     def set_current_selection(self, selection: tuple[int, ...]) -> None:
         self.current_selection = selection
 
+    def get_leaves(self) -> list[int]:
+        """
+        Get the indices of nodes (in hist) that have no children—i.e., "leaves" of current DAG.
+        Returns:
+            list of ints: Indices of leaf nodes.
+            - Leaves with lower index come first (the result is sorted in ascending order).
+        """
+        # Build a set of all parent indices found in dag_parent (empty tuples, which represent roots, contribute nothing)
+        parent_indices = set(idx for parents in self.dag_parent for idx in parents)
+        # All node indices
+        all_indices = set(range(len(self.hist)))
+        # The leaf nodes have no children, so they are not present as parents of any other node
+        leaves = list(sorted(all_indices - parent_indices))
+        return leaves
+
     def sync_dag_parent_and_hist(
         self,
     ) -> None:
@@ -90,7 +105,9 @@ def sync_dag_parent_and_hist(
             self.dag_parent.append((current_node_idx,))
 
     def retrieve_search_list(
-        self, search_type: Literal["all", "ancestors"] = "ancestors"
+        self,
+        search_type: Literal["all", "ancestors"] = "ancestors",
+        selection: tuple[int, ...] | None = None,
     ) -> list[tuple[DSExperiment, ExperimentFeedback]]:
         """
         Retrieve the search list based on the selection and search_type.
@@ -108,7 +125,9 @@ def retrieve_search_list(
             The search list.
         """
 
-        selection = self.get_current_selection()
+        if selection is None:
+            selection = self.get_current_selection()
+
         if selection is None:
             # selection is None, which means we switch to a new trace, which is not implemented yet
             return []
@@ -175,11 +194,12 @@ def experiment_and_feedback_list_after_init(
         self,
         return_type: Literal["sota", "failed", "all"],
         search_type: Literal["all", "ancestors"] = "all",
+        selection: tuple[int, ...] | None = None,
     ) -> list[tuple[DSExperiment, ExperimentFeedback]]:
         """
         Retrieve a list of experiments and feedbacks based on the return_type.
         """
-        search_list = self.retrieve_search_list(search_type)
+        search_list = self.retrieve_search_list(search_type, selection=selection)
 
         final_component = self.COMPLETE_ORDER[-1]
         has_final_component = True if DS_RD_SETTING.coder_on_whole_pipeline else False
@@ -199,6 +219,7 @@ def experiment_and_feedback_list_after_init(
     def sota_experiment(
         self,
         search_type: Literal["all", "ancestors"] = "ancestors",
+        selection: tuple[int, ...] | None = None,
     ) -> DSExperiment | None:
         """
 
@@ -207,7 +228,7 @@ def sota_experiment(
         Experiment or None
             The experiment result if found, otherwise None.
         """
-        search_list = self.retrieve_search_list(search_type)
+        search_list = self.retrieve_search_list(search_type, selection=selection)
 
         if DS_RD_SETTING.coder_on_whole_pipeline or self.next_incomplete_component() is None:
             for exp, ef in search_list[::-1]:
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/merge.py b/rdagent/scenarios/data_science/proposal/exp_gen/merge.py
@@ -0,0 +1,57 @@
+"""Merge the version in different traces"""
+
+from rdagent.components.coder.data_science.pipeline.exp import PipelineTask
+from rdagent.core.proposal import ExpGen
+from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
+from rdagent.scenarios.data_science.proposal.exp_gen.base import DSHypothesis, DSTrace
+from rdagent.utils.agent.tpl import T
+
+
+class MergeExpGen(ExpGen):
+    """Experiment generator that asks for a merge of the solutions found on two leaves of the trace."""
+
+    def gen(self, trace: DSTrace, selection: tuple[int, ...] = (-1,)) -> DSExperiment:
+        """
+        Build a single-task pipeline experiment that merges the first two leaves of `trace`.
+
+        Parameters
+        ----------
+        trace :
+            The trace whose leaves (see `trace.get_leaves()`) are merged.
+            Its current selection is overridden to the first leaf as a side effect.
+        selection :
+            Ignored; the leaves of the trace are used instead.
+
+        Returns
+        -------
+        DSExperiment
+            An experiment holding one `PipelineTask` and a "Pipeline" hypothesis. When a SOTA
+            experiment is found for the first leaf, its workspace code is injected into the new experiment.
+
+        NOTE(review): `leaves[0]` and `leaves[1]` are indexed without a length check, so a trace with
+        fewer than two leaves raises IndexError here.
+        """
+        # Ignore the selection argument and use all leaves instead.
+        leaves: list[int] = trace.get_leaves()
+        trace.set_current_selection((leaves[0],))  # override the current selection.
+
+        # Assume we are merging the first and second traces (i.e. the first two leaves).
+        sota_exp = trace.sota_experiment(selection=(leaves[0],))
+        exp_to_merge = trace.sota_experiment(selection=(leaves[1],))
+
+        # scenario_desc (trace.scen.get_scenario_all_desc()) is not needed in the task description,
+        # so it is not computed here.
+
+        # Render the description of the main solution (first leaf).
+        sota_exp_desc = T("scenarios.data_science.share:describe.exp").r(
+            exp=sota_exp,
+            heading="Best of previous exploration of the scenario",
+        )
+        # Render the description of the solution that is merged in (second leaf).
+        exp_to_merge_desc = T("scenarios.data_science.share:describe.exp").r(
+            exp=exp_to_merge,
+            heading="A solution that to be merged into previous best solution",
+        )
+
+        # Render the "sota" experiment/feedback list retrieved with the second leaf as selection.
+        exp_and_feedback_list_desc = T("scenarios.data_science.share:describe.trace").r(
+            exp_and_feedback_list=trace.experiment_and_feedback_list_after_init(
+                return_type="sota", selection=(leaves[1],)
+            ),
+            type="success",
+        )
+
+        # Combine the three rendered descriptions into the merge task prompt.
+        task = PipelineTask(
+            description=T("scenarios.data_science.proposal.exp_gen.merge:task").r(
+                sota_exp_desc=sota_exp_desc,
+                exp_to_merge_desc=exp_to_merge_desc,
+                exp_and_feedback_list_desc=exp_and_feedback_list_desc,
+            )
+        )
+
+        exp = DSExperiment(
+            pending_tasks_list=[[task]],
+            hypothesis=DSHypothesis(
+                component="Pipeline",
+                hypothesis="Merging two different versions of solutions would get the best of both sides and result in a better solution",
+            ),
+        )
+
+        # Start from the main solution's code when one exists.
+        if sota_exp is not None:
+            exp.experiment_workspace.inject_code_from_file_dict(sota_exp.experiment_workspace)
+        return exp
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/merge.yaml b/rdagent/scenarios/data_science/proposal/exp_gen/merge.yaml
@@ -0,0 +1,20 @@
+task: |-
+  {% include "scenarios.data_science.share:scen.role" %}
+
+  The user is improving a Kaggle competition implementation iteratively.
+  Your task is to merge two solutions to create a better version. We expect the merged version to perform better than both given solutions.
+
+  You will be given:
+  1) Previous Main Solution: this is the main solution you will build on to create an improved version;
+  2) Solution to be merged: another solution that you will combine with the previous main solution.
+    - Solution: the approach or method used in this solution.
+    - Successful iterations: the steps or changes that led to the success of `Solution to be merged`.
+
+  # Previous Main Solution
+  {{ sota_exp_desc }}
+
+  # Solution to be merged
+  ## Solution Description:
+  {{ exp_to_merge_desc }}
+  ## Successful iterations:
+  {{ exp_and_feedback_list_desc }}
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/naive.py b/rdagent/scenarios/data_science/proposal/exp_gen/naive.py
@@ -2,7 +2,6 @@
 The most naive way to design experiments
 """
 
-from rdagent.app.data_science.conf import DS_RD_SETTING
 from rdagent.components.coder.data_science.pipeline.exp import PipelineTask
 from rdagent.core.proposal import ExpGen
 from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -14,9 +14,7 @@
 from rdagent.oai.llm_utils import APIBackend, md5_hash
 from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
 from rdagent.scenarios.data_science.proposal.exp_gen.base import DSHypothesis, DSTrace
-from rdagent.scenarios.data_science.proposal.exp_gen.idea_pool import (
-    DSIdea,
-)
+from rdagent.scenarios.data_science.proposal.exp_gen.idea_pool import DSIdea
 from rdagent.utils.agent.tpl import T
 from rdagent.utils.repo.diff import generate_diff_from_dict
 from rdagent.utils.workflow import wait_retry
diff --git a/rdagent/scenarios/data_science/share.yaml b/rdagent/scenarios/data_science/share.yaml
@@ -332,4 +332,4 @@ component_spec:
 
 guidelines:
   coding: |-
-    You might receive exploratory data analysis (EDA) details about the source data. Do not use this EDA information to create assertions or raise errors. We might generate sample data for quick coding (so your code may run on sample data which is part of the full-size data), but remember that the EDA details are based on the full-size data.
+    You might receive exploratory data analysis (EDA) details about the source data. Do not use this EDA information to create assertions or raise errors. We might generate sample data for quick coding (so your code may run on sample data which is part of the full-size data), but remember that the EDA details are based on the full-size data.
diff --git a/rdagent/utils/workflow.py b/rdagent/utils/workflow.py
@@ -214,10 +214,13 @@ def load(
             session = cast(LoopBase, pickle.load(f))
 
         # set session folder
-        if output_path:
-            output_path = Path(output_path)
-            output_path.mkdir(parents=True, exist_ok=True)
-            session.session_folder = output_path / "__session__"
+        # - P1: if output_path explicitly specified.
+        # - P2: RD_AGENT_SETTINGS.log_trace_path
+        output_path_value = output_path if output_path is not None else RD_AGENT_SETTINGS.log_trace_path
+        if output_path_value is not None:
+            output_path_path = Path(output_path_value)
+            output_path_path.mkdir(parents=True, exist_ok=True)
+            session.session_folder = output_path_path / "__session__"
 
         # set trace path
         logger.set_trace_path(session.session_folder.parent)

-Original file line number
+Diff line change
 # Environments
 .env*
 +*.env
 .venv
 ^env/
 venv/