microsoft · qew21 · Jun 4, 2025 · May 9, 2025 · May 29, 2025 · May 29, 2025
diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py
@@ -3,15 +3,13 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import Generic, TypeVar
+from typing import Generic, List, Tuple, TypeVar
 
 from rdagent.core.evaluation import Feedback
 from rdagent.core.experiment import ASpecificExp, Experiment
 from rdagent.core.knowledge_base import KnowledgeBase
 from rdagent.core.scenario import Scenario
 
-# class data_ana: XXX
-
 
 class Hypothesis:
     """
@@ -105,6 +103,7 @@ def __str__(self) -> str:
 
 class Trace(Generic[ASpecificScen, ASpecificKB]):
     NodeType = tuple[Experiment, ExperimentFeedback]  # Define NodeType as a new type representing the tuple
+    NEW_ROOT: Tuple = ()
 
     def __init__(self, scen: ASpecificScen, knowledge_base: ASpecificKB | None = None) -> None:
         self.scen: ASpecificScen = scen
@@ -116,6 +115,7 @@ def __init__(self, scen: ASpecificScen, knowledge_base: ASpecificKB | None = Non
 
         # TODO: self.hist is 2-tuple now, remove hypothesis from it, change old code for this later.
         self.knowledge_base: ASpecificKB | None = knowledge_base
+        self.current_selection: tuple[int, ...] = (-1,)
 
     def get_sota_hypothesis_and_experiment(self) -> tuple[Hypothesis | None, Experiment | None]:
         """Access the last experiment result, sub-task, and the corresponding hypothesis."""
@@ -126,6 +126,81 @@ def get_sota_hypothesis_and_experiment(self) -> tuple[Hypothesis | None, Experim
 
         return None, None
 
+    def is_selection_new_tree(self, selection: tuple[int, ...] | None = None) -> bool:
+        """
+        Check whether the given selection (default: the current selection) starts a new tree.
+        - selection may be (-1,) while dag_parent is empty; that case is also treated as a new tree.
+        """
+        if selection is None:
+            selection = self.get_current_selection()
+
+        return selection == self.NEW_ROOT or len(self.dag_parent) == 0
+
+    def get_current_selection(self) -> tuple[int, ...]:
+        return self.current_selection
+
+    def set_current_selection(self, selection: tuple[int, ...]) -> None:
+        self.current_selection = selection
+
+    def collect_all_ancestors(
+        self,
+        selection: tuple[int, ...] | None = None,
+    ) -> list[Trace.NodeType]:
+        """
+        Collect the node referred to by the selection together with all of its ancestors.
+        The returned list is ordered as [root->...->parent->current_node].
+        """
+        if selection is None:
+            selection = self.get_current_selection()
+
+        if self.is_selection_new_tree(selection):
+            return []
+
+        else:
+            all_ancestors: list[Trace.NodeType] = []
+
+            # start from the latest selection
+            current_node_idx = selection[0]
+
+            # add the current node to the list
+            all_ancestors.insert(0, self.hist[current_node_idx])
+
+            parent_idx = self.dag_parent[current_node_idx]
+
+            while len(parent_idx) > 0:
+                all_ancestors.insert(0, self.hist[parent_idx[0]])
+                parent_idx = self.dag_parent[parent_idx[0]]
+
+        return all_ancestors
+
+    def exp2idx(self, exp: Experiment | List[Experiment]) -> int | List[int] | None:
+        if isinstance(exp, list):
+            exps: List[Experiment] = exp
+            return [i for i, (_exp, _) in enumerate(self.hist) if _exp in exps]
+        else:
+            for i, (_exp, _) in enumerate(self.hist):
+                if _exp == exp:
+                    return i
+        return None
+
+    def idx2exp(self, idx: int | List[int]) -> Experiment | List[Experiment]:
+        if isinstance(idx, list):
+            idxs: List[int] = idx
+            return [self.hist[_idx][0] for _idx in idxs]
+        else:
+            return self.hist[idx][0]
+
+    def is_parent(self, parent_idx: int, child_idx: int) -> bool:
+        ancestors = self.get_parents(child_idx)
+        return parent_idx in ancestors
+
+    def get_parents(self, child_idx: int) -> List[int]:
+        ancestors = self.collect_all_ancestors((child_idx,))
+        ancestor_exps = [exp for exp, _ in ancestors]
+        parent_idxs = self.exp2idx(ancestor_exps)
+        assert isinstance(parent_idxs, list)
+        return parent_idxs
+
 
 class CheckpointSelector:
     """

diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/base.py b/rdagent/scenarios/data_science/proposal/exp_gen/base.py
@@ -1,5 +1,5 @@
 from abc import abstractmethod
-from typing import Literal
+from typing import List, Literal
 
 from rdagent.app.data_science.conf import DS_RD_SETTING
 from rdagent.core.evolving_framework import KnowledgeBase
@@ -61,21 +61,13 @@ def __init__(self, scen: DataScienceScen, knowledge_base: KnowledgeBase | None =
 
         self.knowledge_base = knowledge_base
 
-        self.current_selection: tuple[int, ...] = (-1,)
-
         self.sota_exp_to_submit: DSExperiment | None = None  # grab the global best exp to submit
 
     COMPLETE_ORDER = ("DataLoadSpec", "FeatureEng", "Model", "Ensemble", "Workflow")
 
     def set_sota_exp_to_submit(self, exp: DSExperiment) -> None:
         self.sota_exp_to_submit = exp
 
-    def get_current_selection(self) -> tuple[int, ...]:
-        return self.current_selection
-
-    def set_current_selection(self, selection: tuple[int, ...]) -> None:
-        self.current_selection = selection
-
     @property
     def sub_trace_count(self) -> int:
         return len(self.get_leaves())
@@ -144,50 +136,11 @@ def retrieve_search_list(
             return self.hist
 
         elif search_type == "ancestors":
-
-            if selection is None:
-                selection = self.get_current_selection()
-
-            if len(selection) == 0:
-                # selection is (), which means we switch to a new trace
-                return []
-
             return self.collect_all_ancestors(selection)
 
         else:
             raise ValueError(f"Invalid search type: {search_type}")
 
-    def collect_all_ancestors(
-        self,
-        selection: tuple[int, ...] | None = None,
-    ) -> list[tuple[DSExperiment, ExperimentFeedback]]:
-        """
-        Collect all ancestors of the given selection.
-        The return list follows the order of [root->...->parent->current_node].
-        """
-        if selection is None:
-            selection = self.get_current_selection()
-
-        if len(self.dag_parent) == 0:
-            return []
-
-        else:
-            all_ancestors = []
-
-            # start from the latest selection
-            current_node_idx = selection[0]
-
-            # add the current node to the list
-            all_ancestors.insert(0, self.hist[current_node_idx])
-
-            parent_idx = self.dag_parent[current_node_idx]
-
-            while len(parent_idx) > 0:
-                all_ancestors.insert(0, self.hist[parent_idx[0]])
-                parent_idx = self.dag_parent[parent_idx[0]]
-
-        return all_ancestors
-
     def next_incomplete_component(
         self,
         search_type: Literal["all", "ancestors"] = "ancestors",
@@ -226,10 +179,6 @@ def experiment_and_feedback_list_after_init(
         Retrieve a list of experiments and feedbacks based on the return_type.
         """
         search_list = self.retrieve_search_list(search_type, selection=selection)
-        if max_retrieve_num is not None and len(search_list) > 0:
-            retrieve_num = min(max_retrieve_num, len(search_list))
-            search_list = search_list[:retrieve_num]
-
         final_component = self.COMPLETE_ORDER[-1]
         has_final_component = True if DS_RD_SETTING.coder_on_whole_pipeline else False
         SOTA_exp_and_feedback_list = []
@@ -243,6 +192,13 @@ def experiment_and_feedback_list_after_init(
                     failed_exp_and_feedback_list.append((exp, fb))
             if exp.hypothesis.component == final_component and fb:
                 has_final_component = True
+        if max_retrieve_num is not None and (SOTA_exp_and_feedback_list or failed_exp_and_feedback_list):
+            SOTA_exp_and_feedback_list = SOTA_exp_and_feedback_list[
+                -min(max_retrieve_num, len(SOTA_exp_and_feedback_list)) :
+            ]
+            failed_exp_and_feedback_list = failed_exp_and_feedback_list[
+                -min(max_retrieve_num, len(failed_exp_and_feedback_list)) :
+            ]
         if return_type == "all":
             return SOTA_exp_and_feedback_list + failed_exp_and_feedback_list
         elif return_type == "failed":

diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/ckp_select.py b/rdagent/scenarios/data_science/proposal/exp_gen/ckp_select.py
@@ -56,7 +56,7 @@ def get_selection(self, trace: Trace) -> tuple[int, ...]:
 
         Returns:
             (-1,): Continue with the current latest trial
-            (): Start a new sub-trace if max trace limit not reached
+            trace.NEW_ROOT: Start a new sub-trace if max trace limit not reached
         """
 
         if self.time_limit_pre_trace is None:
@@ -69,8 +69,8 @@ def get_selection(self, trace: Trace) -> tuple[int, ...]:
             logger.info(f"Starting initial sub-trace {trace.sub_trace_count} at {current_time}")
             return (-1,)  # Continue with latest trial for new sub-trace
 
-        # Calculate elapsed time for current sub-trace
-        elapsed_time = current_time - self.sub_trace_start_times[trace.sub_trace_count - 1]
+        # Calculate elapsed time for the current sub-trace; the trace count may be larger than MAX_TRACE_NUM during the merge process
+        elapsed_time = current_time - self.sub_trace_start_times[min(trace.sub_trace_count, self.MAX_TRACE_NUM) - 1]
 
         if elapsed_time < self.time_limit_pre_trace:
             # Continue with current sub-trace
@@ -94,7 +94,7 @@ def get_selection(self, trace: Trace) -> tuple[int, ...]:
                 f"Elapsed time {elapsed_time} exceeds time limit {self.time_limit_pre_trace}, jump to a new sub-trace"
             )
             logger.info(f"current sub-trace count: {trace.sub_trace_count}")
-            return tuple()  # Empty tuple signals starting a new sub-trace
+            return trace.NEW_ROOT  # Empty tuple signals starting a new sub-trace
 
 
 class SOTAJumpCKPSelector(CheckpointSelector):
@@ -140,7 +140,7 @@ def get_selection(self, trace: Trace) -> tuple[int, ...]:
                     f"SOTA count {sota_count} is below threshold {self.SOTA_COUNT_THRESHOLD}, jump to a new sub-trace"
                 )
                 logger.info(f"current sub-trace count: {trace.sub_trace_count}")
-                return ()
+                return trace.NEW_ROOT
             else:
                 logger.info(
                     f"SOTA count {sota_count} is above threshold {self.SOTA_COUNT_THRESHOLD}, continue the current latest trial"
@@ -201,7 +201,7 @@ def get_selection(self, trace: Trace) -> tuple[int, ...]:
                     logger.info(
                         f"SOTA count {sota_count} is below threshold {self.SOTA_COUNT_THRESHOLD}, jump a new sub-trace"
                     )
-                    return ()  # reboot a new sub-trace
+                    return trace.NEW_ROOT  # reboot a new sub-trace
                 else:
                     logger.info(
                         f"SOTA count {sota_count} is below threshold {self.SOTA_COUNT_THRESHOLD}, jump back to the last second SOTA in hist (may not in current sub-trace)"
@@ -227,7 +227,7 @@ def get_selection(self, trace: Trace) -> tuple[int, ...]:
                             f"SOTA count {sota_count} is below threshold {self.SOTA_COUNT_THRESHOLD}, jump a new sub-trace"
                         )
                         logger.info(f"current sub-trace count: {trace.sub_trace_count}")
-                        return ()  # reboot a new sub-trace
+                        return trace.NEW_ROOT  # reboot a new sub-trace
 
             else:
                 logger.info(