Skip to content
Merged
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
0da5fa5
chore: avoid incorporate changes
qew21 May 9, 2025
a6d0594
chore: align to previous code
qew21 May 29, 2025
52bf6d2
chore: restore v2 prompt
qew21 May 29, 2025
033cfe4
fix: trace without selection
qew21 May 29, 2025
f39e7a9
fix: without selection
qew21 May 30, 2025
3cdce0e
chore: update collect_all_ancestors
qew21 May 30, 2025
9842455
fix: remove params trace
qew21 May 30, 2025
a28b1ee
fix: no dag_parent
qew21 May 30, 2025
1546167
chore: refactoring
qew21 May 30, 2025
513c00b
add trace.NEW_ROOT
qew21 May 30, 2025
57d4190
reformat
qew21 May 30, 2025
b370c6e
fix: no scen_prob_multiplier
qew21 May 30, 2025
22fb6bf
add type annotation
qew21 May 30, 2025
9043834
refactoring
qew21 May 30, 2025
c0b1e35
sub_trace_count for merge v2
qew21 May 30, 2025
567e554
fix: no problem key
qew21 May 31, 2025
5386958
fix: use regex with timeout
qew21 May 31, 2025
cd156a3
fix: sota_exp_fb is None
qew21 May 31, 2025
f986d04
chore: remove redundant code
qew21 Jun 3, 2025
e9d59ba
Update rdagent/scenarios/data_science/proposal/exp_gen/base.py
you-n-g Jun 3, 2025
e78d324
refactor regex sub
qew21 Jun 3, 2025
c28cb5d
chore: hypothesis_rank with selected_idx
qew21 Jun 3, 2025
181d0f0
chore: check sota in selection
qew21 Jun 3, 2025
6b92bed
update annotation
qew21 Jun 3, 2025
6412818
refactor: add parent/idx mapping API to Trace and streamline merge logic
you-n-g Jun 3, 2025
a09dfdc
chore: define is_parent in proposal
qew21 Jun 3, 2025
93fde93
fix ci
qew21 Jun 3, 2025
0c7bb27
fix ci
qew21 Jun 3, 2025
70458d8
fix ci
qew21 Jun 3, 2025
21cef4d
fix ci
qew21 Jun 3, 2025
a38a812
refactoring
qew21 Jun 3, 2025
f028c10
fix ci
qew21 Jun 3, 2025
cd6fbd4
chore: rename collect_all_ancestors to get_parent_exps
qew21 Jun 4, 2025
1a45015
raise error when exp not in hist
qew21 Jun 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 78 additions & 3 deletions rdagent/core/proposal.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,13 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Generic, TypeVar
from typing import Generic, List, Tuple, TypeVar

from rdagent.core.evaluation import Feedback
from rdagent.core.experiment import ASpecificExp, Experiment
from rdagent.core.knowledge_base import KnowledgeBase
from rdagent.core.scenario import Scenario

# class data_ana: XXX


class Hypothesis:
"""
Expand Down Expand Up @@ -105,6 +103,7 @@ def __str__(self) -> str:

class Trace(Generic[ASpecificScen, ASpecificKB]):
NodeType = tuple[Experiment, ExperimentFeedback] # Define NodeType as a new type representing the tuple
NEW_ROOT: Tuple = ()

def __init__(self, scen: ASpecificScen, knowledge_base: ASpecificKB | None = None) -> None:
self.scen: ASpecificScen = scen
Expand All @@ -116,6 +115,7 @@ def __init__(self, scen: ASpecificScen, knowledge_base: ASpecificKB | None = Non

# TODO: self.hist is 2-tuple now, remove hypothesis from it, change old code for this later.
self.knowledge_base: ASpecificKB | None = knowledge_base
self.current_selection: tuple[int, ...] = (-1,)

def get_sota_hypothesis_and_experiment(self) -> tuple[Hypothesis | None, Experiment | None]:
"""Access the last experiment result, sub-task, and the corresponding hypothesis."""
Expand All @@ -126,6 +126,81 @@ def get_sota_hypothesis_and_experiment(self) -> tuple[Hypothesis | None, Experim

return None, None

def is_selection_new_tree(self, selection: tuple[int, ...] | None = None) -> bool:
    """
    Tell whether the given selection denotes the start of a new tree.

    - When `selection` is None, the value of `get_current_selection()` is used.
    - The result is True when the selection equals `NEW_ROOT`, or when
      `dag_parent` is empty (the selection may still be (-1,) in that case).
    """
    chosen = self.get_current_selection() if selection is None else selection

    if chosen == self.NEW_ROOT:
        return True
    return len(self.dag_parent) == 0

def get_current_selection(self) -> tuple[int, ...]:
    """Return the selection currently stored in `self.current_selection`."""
    current = self.current_selection
    return current

def set_current_selection(self, selection: tuple[int, ...]) -> None:
    """Store `selection` in `self.current_selection`, replacing the previous value."""
    self.current_selection = selection

def collect_all_ancestors(
    self,
    selection: tuple[int, ...] | None = None,
) -> list[Trace.NodeType]:
    """
    Collect the selected node together with all of its ancestors.

    Parameters
    ----------
    selection:
        The selection to start from; only its first element is used as the
        index of the starting node in `self.hist`. When None, the value of
        `get_current_selection()` is used.

    Returns
    -------
    list[Trace.NodeType]
        The nodes in the order [root -> ... -> parent -> current_node].
        An empty list is returned when `is_selection_new_tree(selection)` is true.
    """
    if selection is None:
        selection = self.get_current_selection()

    if self.is_selection_new_tree(selection):
        return []

    # Walk upwards from the selected node, always following the first parent.
    # Nodes are appended and reversed once at the end; inserting at the front
    # on every step would make the walk quadratic in the chain length.
    lineage: list[Trace.NodeType] = []
    node_idx = selection[0]
    while True:
        lineage.append(self.hist[node_idx])
        parents = self.dag_parent[node_idx]
        if len(parents) == 0:
            # An empty parent tuple marks the root of the chain.
            break
        node_idx = parents[0]

    lineage.reverse()
    return lineage

def exp2idx(self, exp: Experiment | List[Experiment]) -> int | List[int] | None:
    """
    Map experiment(s) to their position(s) in `self.hist`.

    - For a single experiment: return the index of the first entry in
      `self.hist` whose experiment equals `exp`, or None when there is none.
    - For a list of experiments: return the indices (in `self.hist` order) of
      every entry whose experiment is contained in the list.
    """
    if not isinstance(exp, list):
        for position, (candidate, _) in enumerate(self.hist):
            if candidate == exp:
                return position
        return None

    wanted: List[Experiment] = exp
    matches = []
    for position, (candidate, _) in enumerate(self.hist):
        if candidate in wanted:
            matches.append(position)
    return matches

def idx2exp(self, idx: int | List[int]) -> Experiment | List[Experiment]:
    """
    Map index/indices into `self.hist` to the experiment(s) stored there.

    A single index yields a single experiment; a list of indices yields a list
    of experiments in the same order as the given indices.
    """
    if not isinstance(idx, list):
        return self.hist[idx][0]

    positions: List[int] = idx
    experiments = []
    for position in positions:
        experiments.append(self.hist[position][0])
    return experiments

def is_parent(self, parent_idx: int, child_idx: int) -> bool:
    """
    Return whether `parent_idx` is among the indices given by `get_parents(child_idx)`.

    NOTE(review): `get_parents` derives its list from `collect_all_ancestors`,
    which also contains the selected node itself, so `is_parent(i, i)` looks
    like it evaluates to True -- confirm this is intended.
    """
    return parent_idx in self.get_parents(child_idx)

def get_parents(self, child_idx: int) -> List[int]:
    """
    Return the `self.hist` indices of the nodes collected by
    `collect_all_ancestors((child_idx,))`.

    The collected nodes are reduced to their experiments, which are then
    translated back to indices through `exp2idx`.
    """
    lineage = self.collect_all_ancestors((child_idx,))

    lineage_exps = []
    for exp, _ in lineage:
        lineage_exps.append(exp)

    parent_idxs = self.exp2idx(lineage_exps)
    # exp2idx answers with a list when it is handed a list; narrow the type here.
    assert isinstance(parent_idxs, list)
    return parent_idxs


class CheckpointSelector:
"""
Expand Down
60 changes: 8 additions & 52 deletions rdagent/scenarios/data_science/proposal/exp_gen/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from abc import abstractmethod
from typing import Literal
from typing import List, Literal

from rdagent.app.data_science.conf import DS_RD_SETTING
from rdagent.core.evolving_framework import KnowledgeBase
Expand Down Expand Up @@ -61,21 +61,13 @@ def __init__(self, scen: DataScienceScen, knowledge_base: KnowledgeBase | None =

self.knowledge_base = knowledge_base

self.current_selection: tuple[int, ...] = (-1,)

self.sota_exp_to_submit: DSExperiment | None = None # grab the global best exp to submit

COMPLETE_ORDER = ("DataLoadSpec", "FeatureEng", "Model", "Ensemble", "Workflow")

def set_sota_exp_to_submit(self, exp: DSExperiment) -> None:
    """Store `exp` in `self.sota_exp_to_submit`, replacing the previous value."""
    self.sota_exp_to_submit = exp

def get_current_selection(self) -> tuple[int, ...]:
    """Return the selection currently stored in `self.current_selection`."""
    current = self.current_selection
    return current

def set_current_selection(self, selection: tuple[int, ...]) -> None:
    """Store `selection` in `self.current_selection`, replacing the previous value."""
    self.current_selection = selection

@property
def sub_trace_count(self) -> int:
    """Number of sub-traces, taken as the number of items returned by `get_leaves()`."""
    leaves = self.get_leaves()
    return len(leaves)
Expand Down Expand Up @@ -144,50 +136,11 @@ def retrieve_search_list(
return self.hist

elif search_type == "ancestors":

if selection is None:
selection = self.get_current_selection()

if len(selection) == 0:
# selection is (), which means we switch to a new trace
return []

return self.collect_all_ancestors(selection)

else:
raise ValueError(f"Invalid search type: {search_type}")

def collect_all_ancestors(
    self,
    selection: tuple[int, ...] | None = None,
) -> list[tuple[DSExperiment, ExperimentFeedback]]:
    """
    Collect the selected node together with all of its ancestors.

    Parameters
    ----------
    selection:
        The selection to start from; only its first element is used as the
        index of the starting node in `self.hist`. When None, the value of
        `get_current_selection()` is used.

    Returns
    -------
    list[tuple[DSExperiment, ExperimentFeedback]]
        The nodes in the order [root -> ... -> parent -> current_node].
        An empty list is returned when `dag_parent` is empty or when the
        selection itself is empty.
    """
    if selection is None:
        selection = self.get_current_selection()

    # An empty selection has no starting node; indexing it would raise
    # IndexError, so treat it like an empty history and return nothing.
    if len(self.dag_parent) == 0 or len(selection) == 0:
        return []

    # Walk upwards from the selected node, always following the first parent.
    # Nodes are appended and reversed once at the end; inserting at the front
    # on every step would make the walk quadratic in the chain length.
    lineage = []
    node_idx = selection[0]
    while True:
        lineage.append(self.hist[node_idx])
        parents = self.dag_parent[node_idx]
        if len(parents) == 0:
            # An empty parent tuple marks the root of the chain.
            break
        node_idx = parents[0]

    lineage.reverse()
    return lineage

def next_incomplete_component(
self,
search_type: Literal["all", "ancestors"] = "ancestors",
Expand Down Expand Up @@ -226,10 +179,6 @@ def experiment_and_feedback_list_after_init(
Retrieve a list of experiments and feedbacks based on the return_type.
"""
search_list = self.retrieve_search_list(search_type, selection=selection)
if max_retrieve_num is not None and len(search_list) > 0:
retrieve_num = min(max_retrieve_num, len(search_list))
search_list = search_list[:retrieve_num]

final_component = self.COMPLETE_ORDER[-1]
has_final_component = True if DS_RD_SETTING.coder_on_whole_pipeline else False
SOTA_exp_and_feedback_list = []
Expand All @@ -243,6 +192,13 @@ def experiment_and_feedback_list_after_init(
failed_exp_and_feedback_list.append((exp, fb))
if exp.hypothesis.component == final_component and fb:
has_final_component = True
if max_retrieve_num is not None and (SOTA_exp_and_feedback_list or failed_exp_and_feedback_list):
SOTA_exp_and_feedback_list = SOTA_exp_and_feedback_list[
-min(max_retrieve_num, len(SOTA_exp_and_feedback_list)) :
]
failed_exp_and_feedback_list = failed_exp_and_feedback_list[
-min(max_retrieve_num, len(failed_exp_and_feedback_list)) :
]
if return_type == "all":
return SOTA_exp_and_feedback_list + failed_exp_and_feedback_list
elif return_type == "failed":
Expand Down
14 changes: 7 additions & 7 deletions rdagent/scenarios/data_science/proposal/exp_gen/ckp_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def get_selection(self, trace: Trace) -> tuple[int, ...]:

Returns:
(-1,): Continue with the current latest trial
(): Start a new sub-trace if max trace limit not reached
trace.NEW_ROOT: Start a new sub-trace if max trace limit not reached
"""

if self.time_limit_pre_trace is None:
Expand All @@ -69,8 +69,8 @@ def get_selection(self, trace: Trace) -> tuple[int, ...]:
logger.info(f"Starting initial sub-trace {trace.sub_trace_count} at {current_time}")
return (-1,) # Continue with latest trial for new sub-trace

# Calculate elapsed time for current sub-trace
elapsed_time = current_time - self.sub_trace_start_times[trace.sub_trace_count - 1]
# Calculate elapsed time for the current sub-trace; the trace count may be larger than MAX_TRACE_NUM during the merge process
elapsed_time = current_time - self.sub_trace_start_times[min(trace.sub_trace_count, self.MAX_TRACE_NUM) - 1]

if elapsed_time < self.time_limit_pre_trace:
# Continue with current sub-trace
Expand All @@ -94,7 +94,7 @@ def get_selection(self, trace: Trace) -> tuple[int, ...]:
f"Elapsed time {elapsed_time} exceeds time limit {self.time_limit_pre_trace}, jump to a new sub-trace"
)
logger.info(f"current sub-trace count: {trace.sub_trace_count}")
return tuple() # Empty tuple signals starting a new sub-trace
return trace.NEW_ROOT # Empty tuple signals starting a new sub-trace


class SOTAJumpCKPSelector(CheckpointSelector):
Expand Down Expand Up @@ -140,7 +140,7 @@ def get_selection(self, trace: Trace) -> tuple[int, ...]:
f"SOTA count {sota_count} is below threshold {self.SOTA_COUNT_THRESHOLD}, jump to a new sub-trace"
)
logger.info(f"current sub-trace count: {trace.sub_trace_count}")
return ()
return trace.NEW_ROOT
else:
logger.info(
f"SOTA count {sota_count} is above threshold {self.SOTA_COUNT_THRESHOLD}, continue the current latest trial"
Expand Down Expand Up @@ -201,7 +201,7 @@ def get_selection(self, trace: Trace) -> tuple[int, ...]:
logger.info(
f"SOTA count {sota_count} is below threshold {self.SOTA_COUNT_THRESHOLD}, jump a new sub-trace"
)
return () # reboot a new sub-trace
return trace.NEW_ROOT # reboot a new sub-trace
else:
logger.info(
f"SOTA count {sota_count} is below threshold {self.SOTA_COUNT_THRESHOLD}, jump back to the last second SOTA in hist (may not in current sub-trace)"
Expand All @@ -227,7 +227,7 @@ def get_selection(self, trace: Trace) -> tuple[int, ...]:
f"SOTA count {sota_count} is below threshold {self.SOTA_COUNT_THRESHOLD}, jump a new sub-trace"
)
logger.info(f"current sub-trace count: {trace.sub_trace_count}")
return () # reboot a new sub-trace
return trace.NEW_ROOT # reboot a new sub-trace

else:
logger.info(
Expand Down
Loading