feat: checkpoint selection (microsoft#744)

xuangu-fang · web-flow · commit a15a06ad6439 · 2025-04-09T09:42:30.000+08:00
* rebase selection code

* bug-free run: checkpoint selection and dynamic EDA loading

* add prototypes of various selectors, to implement and test later

* fix EDA write bug

* move selector from proposal.py to select.py

* auto lint

* fix line-too-long typos

* align the design of "selection", remove extra instance check

* make auto-lint

* add non-trivial selector: SOTAjump
diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py
@@ -27,6 +27,10 @@
 from rdagent.scenarios.data_science.dev.runner import DSCoSTEERRunner
 from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
 from rdagent.scenarios.data_science.proposal.exp_gen import DSExpGen, DSTrace
+from rdagent.scenarios.data_science.proposal.exp_gen.select import (
+    LatestCKPSelector,
+    SOTAJumpCKPSelector,
+)
 from rdagent.scenarios.kaggle.kaggle_crawler import download_data
 
 
@@ -49,6 +53,7 @@ def __init__(self, PROP_SETTING: BasePropSetting):
 
         # 2) task generation from a complete solution
         # self.exp_gen: ExpGen = import_class(PROP_SETTING.exp_gen)(scen)
+        self.ckp_selector = LatestCKPSelector()
         self.exp_gen = DSExpGen(scen)
         self.data_loader_coder = DataLoaderCoSTEER(scen)
         self.feature_coder = FeatureCoSTEER(scen)
@@ -68,7 +73,8 @@ def __init__(self, PROP_SETTING: BasePropSetting):
         super(RDLoop, self).__init__()
 
     def direct_exp_gen(self, prev_out: dict[str, Any]):
-        exp = self.exp_gen.gen(self.trace)
+        selection = self.ckp_selector.get_selection(self.trace)
+        exp = self.exp_gen.gen(self.trace, selection)
         logger.log_object(exp)
 
         # FIXME: this is for LLM debug webapp, remove this when the debugging is done.
@@ -126,6 +132,10 @@ def feedback(self, prev_out: dict[str, Any]) -> ExperimentFeedback:
         return feedback
 
     def record(self, prev_out: dict[str, Any]):
+
+        # set the DAG parent for the trace
+        self.trace.sync_dag_parent_and_hist()
+
         e = prev_out.get(self.EXCEPTION_KEY, None)
         if e is None:
             self.trace.hist.append((prev_out["running"], prev_out["feedback"]))
diff --git a/rdagent/components/coder/data_science/ensemble/__init__.py b/rdagent/components/coder/data_science/ensemble/__init__.py
@@ -74,7 +74,7 @@ def implement_one_task(
         )
 
         # Generate code with knowledge integration
-        competition_info = self.scen.get_scenario_all_desc()
+        competition_info = self.scen.get_scenario_all_desc(eda_output=workspace.file_dict.get("EDA.md", None))
         system_prompt = T(".prompts:ensemble_coder.system").r(
             task_desc=ensemble_information_str,
             competition_info=competition_info,
diff --git a/rdagent/components/coder/data_science/feature/__init__.py b/rdagent/components/coder/data_science/feature/__init__.py
@@ -61,7 +61,7 @@ def implement_one_task(
 
         # 2. code
         system_prompt = T(".prompts:feature_coder.system").r(
-            competition_info=self.scen.get_scenario_all_desc(),
+            competition_info=self.scen.get_scenario_all_desc(eda_output=workspace.file_dict.get("EDA.md", None)),
             task_desc=feature_information_str,
             data_loader_code=workspace.file_dict.get("load_data.py"),
             queried_similar_successful_knowledge=queried_similar_successful_knowledge,
diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py
@@ -62,7 +62,7 @@ def implement_one_task(
         # 2. code
         system_prompt = T(".prompts:model_coder.system").r(
             task_desc=model_information_str,
-            competition_info=self.scen.get_scenario_all_desc(),
+            competition_info=self.scen.get_scenario_all_desc(eda_output=workspace.file_dict.get("EDA.md", None)),
             data_loader_code=workspace.file_dict.get("load_data.py"),
             feature_code=workspace.file_dict["feature.py"],
             queried_similar_successful_knowledge=queried_similar_successful_knowledge,
diff --git a/rdagent/components/coder/data_science/pipeline/__init__.py b/rdagent/components/coder/data_science/pipeline/__init__.py
@@ -66,7 +66,7 @@ def implement_one_task(
         workspace: FBWorkspace | None = None,
         prev_task_feedback: CoSTEERSingleFeedback | None = None,
     ) -> dict[str, str]:
-        competition_info = self.scen.get_scenario_all_desc()
+        competition_info = self.scen.get_scenario_all_desc(eda_output=workspace.file_dict.get("EDA.md", None))
         runtime_environment = self.scen.get_runtime_environment()
         data_folder_info = self.scen.processed_data_folder_description
         pipeline_task_info = target_task.get_task_information()
diff --git a/rdagent/components/coder/data_science/pipeline/eval.py b/rdagent/components/coder/data_science/pipeline/eval.py
@@ -119,8 +119,10 @@ def evaluate(
                 )
         stdout += "\n" + submission_check_out
 
+        eda_output = implementation.file_dict.get("EDA.md", None)
+
         system_prompt = T(".prompts:pipeline_eval.system").r(
-            scenario=self.scen.get_scenario_all_desc(),
+            scenario=self.scen.get_scenario_all_desc(eda_output=eda_output),
             task_desc=target_task.get_task_information(),
             spec=T("scenarios.data_science.share:component_spec.Pipeline").r(),
         )
diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py
@@ -67,7 +67,7 @@ def implement_one_task(
     ) -> dict[str, str]:
         # return a workspace with "load_data.py", "spec/load_data.md" inside
         # assign the implemented code to the new workspace.
-        competition_info = self.scen.get_scenario_all_desc()
+        competition_info = self.scen.get_scenario_all_desc(eda_output=workspace.file_dict.get("EDA.md", None))
         runtime_environment = self.scen.get_runtime_environment()
         data_folder_info = self.scen.processed_data_folder_description
         data_loader_task_info = target_task.get_task_information()
@@ -231,5 +231,9 @@ def develop(self, exp):
         stdout = new_exp.experiment_workspace.execute(env=env, entry=f"python test/data_loader_test.py")
         match = re.search(r"(.*?)=== Start of EDA part ===(.*)=== End of EDA part ===", stdout, re.DOTALL)
         eda_output = match.groups()[1] if match else None
-        self.scen.eda_output = eda_output
+        if eda_output is not None:
+            new_exp.experiment_workspace.inject_files(**{"EDA.md": eda_output})
+        else:
+            eda_output = "No EDA output."
+            new_exp.experiment_workspace.inject_files(**{"EDA.md": eda_output})
         return new_exp
diff --git a/rdagent/components/coder/data_science/workflow/__init__.py b/rdagent/components/coder/data_science/workflow/__init__.py
@@ -59,7 +59,7 @@ def implement_one_task(
         # 2. code
         system_prompt = T(".prompts:workflow_coder.system").r(
             task_desc=workflow_information_str,
-            competition_info=self.scen.get_scenario_all_desc(),
+            competition_info=self.scen.get_scenario_all_desc(eda_output=workspace.file_dict.get("EDA.md", None)),
             queried_similar_successful_knowledge=queried_similar_successful_knowledge,
             queried_former_failed_knowledge=queried_former_failed_knowledge[0],
             out_spec=PythonAgentOut.get_spec(),
diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py
@@ -127,7 +127,8 @@ def evaluate(
         stdout += "\n" + submission_check_out
 
         system_prompt = T(".prompts:workflow_eval.system").r(
-            scenario=self.scen.get_scenario_all_desc(),
+            # here we pass `None` to `eda_output` because we do not have nor need EDA output for workflow.
+            scenario=self.scen.get_scenario_all_desc(eda_output=None),
             task_desc=target_task.get_task_information(),
             spec=(
                 implementation.file_dict["spec/workflow.md"]
diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py
@@ -1,4 +1,4 @@
-""" """
+# TODO: remove `self.scen` if traces will be passed into the instance.
 
 from __future__ import annotations
 
@@ -112,9 +112,16 @@ def __str__(self) -> str:
 
 
 class Trace(Generic[ASpecificScen, ASpecificKB]):
+    NodeType = tuple[Experiment, ExperimentFeedback]  # Define NodeType as a new type representing the tuple
+
     def __init__(self, scen: ASpecificScen, knowledge_base: ASpecificKB | None = None) -> None:
         self.scen: ASpecificScen = scen
-        self.hist: list[tuple[Experiment, ExperimentFeedback]] = []
+        self.hist: list[Trace.NodeType] = (
+            []
+        )  # List of tuples containing experiments and their feedback, organized over time.
+        self.dag_parent: list[tuple[int, ...]] = []  # List of tuples representing parent indices in the DAG structure.
+        # (,) represents no parent; (1,) presents one parent; (1, 2) represents two parents.
+
         # TODO: self.hist is 2-tuple now, remove hypothesis from it, change old code for this later.
         self.knowledge_base: ASpecificKB | None = knowledge_base
 
@@ -128,13 +135,32 @@ def get_sota_hypothesis_and_experiment(self) -> tuple[Hypothesis | None, Experim
         return None, None
 
 
+class CheckpointSelector:
+    """
+    In the trace, we may start from any check point (we'll represent it as a variable `from_checkpoint_idx`)
+    """
+
+    @abstractmethod
+    def get_selection(self, trace: Trace) -> tuple[int, ...] | None:
+        """
+        checkpoint_idx represents the place where we want to create a new node.
+        the return value should be the idx of target node (the parent of the new generating node).
+        - `(-1, )` represents starting from the latest trial in the trace - default value
+        - `(idx, )` represents starting from the `idx`-th trial in the trace.
+        - `None` represents starting from scratch (start a new trace)
+
+
+        - More advanced selection strategies in `select.py`
+        """
+
+
 class ExpGen(ABC):
 
     def __init__(self, scen: Scenario) -> None:
         self.scen = scen
 
     @abstractmethod
-    def gen(self, trace: Trace) -> Experiment:
+    def gen(self, trace: Trace, selection: tuple[int, ...] = (-1,)) -> Experiment:
         """
         Generate the experiment based on the trace.
 
diff --git a/rdagent/scenarios/data_science/dev/feedback.py b/rdagent/scenarios/data_science/dev/feedback.py
@@ -86,7 +86,10 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed
                         decision=False,
                     )
 
-        system_prompt = T(".prompts:exp_feedback.system").r(scenario=self.scen.get_scenario_all_desc())
+        eda_output = exp.experiment_workspace.file_dict.get("EDA.md", None)
+        system_prompt = T(".prompts:exp_feedback.system").r(
+            scenario=self.scen.get_scenario_all_desc(eda_output=eda_output)
+        )
         user_prompt = T(".prompts:exp_feedback.user").r(
             sota_desc=sota_desc,
             cur_exp=exp,
diff --git a/rdagent/scenarios/data_science/dev/runner/eval.py b/rdagent/scenarios/data_science/dev/runner/eval.py
@@ -124,7 +124,7 @@ def evaluate(
         stdout += f"\nMLEBench submission check:\n{submission_check_out}\nIf MLEBench submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "
 
         system_prompt = T(".prompts:DSCoSTEER_eval.system").r(
-            scenario=self.scen.get_scenario_all_desc(),
+            scenario=self.scen.get_scenario_all_desc(eda_output=implementation.file_dict.get("EDA.md", None)),
             task_desc=target_task.get_task_information(),
         )
         user_prompt = T(".prompts:DSCoSTEER_eval.user").r(
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/__init__.py b/rdagent/scenarios/data_science/proposal/exp_gen/__init__.py
@@ -20,7 +20,11 @@ class DSExpGen(ExpGen):
     def __init__(self, scen: DataScienceScen) -> None:
         super().__init__(scen)
 
-    def gen(self, trace: DSTrace) -> DSExperiment:
+    def gen(self, trace: DSTrace, selection: tuple[int, ...] = (-1,)) -> DSExperiment:
+
+        # set the current selection for the trace
+        # handy design:dynamically change the "current selection" attribute of the trace, and we donot need to pass selection as an argument to other functions
+        trace.set_current_selection(selection)
 
         if DS_RD_SETTING.proposal_version not in ["v1", "v2"]:
             return import_class(DS_RD_SETTING.proposal_version)(scen=self.scen).gen(trace=trace)
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/base.py b/rdagent/scenarios/data_science/proposal/exp_gen/base.py
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/draft.py b/rdagent/scenarios/data_science/proposal/exp_gen/draft.py
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/select.py b/rdagent/scenarios/data_science/proposal/exp_gen/select.py
diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py

Original file line number	Diff line number	Diff line change
`@@ -74,7 +74,7 @@ def implement_one_task(`
`74`	`74`	`)`
`75`	`75`
`76`	`76`	`# Generate code with knowledge integration`
`77`		`- competition_info = self.scen.get_scenario_all_desc()`
	`77`	`+ competition_info = self.scen.get_scenario_all_desc(eda_output=workspace.file_dict.get("EDA.md", None))`
`78`	`78`	`system_prompt = T(".prompts:ensemble_coder.system").r(`
`79`	`79`	`task_desc=ensemble_information_str,`
`80`	`80`	`competition_info=competition_info,`
Original file line number	Diff line number	Diff line change
`@@ -119,8 +119,10 @@ def evaluate(`
`119`	`119`	`)`
`120`	`120`	`stdout += "\n" + submission_check_out`
`121`	`121`
	`122`	`+ eda_output = implementation.file_dict.get("EDA.md", None)`
	`123`	`+`
`122`	`124`	`system_prompt = T(".prompts:pipeline_eval.system").r(`
`123`		`- scenario=self.scen.get_scenario_all_desc(),`
	`125`	`+ scenario=self.scen.get_scenario_all_desc(eda_output=eda_output),`
`124`	`126`	`task_desc=target_task.get_task_information(),`
`125`	`127`	`spec=T("scenarios.data_science.share:component_spec.Pipeline").r(),`
`126`	`128`	`)`