Skip to content

Commit d7e211c

Browse files
committed
several improvements to the kaggle loop
1 parent 0949cf3 commit d7e211c

File tree

19 files changed

+300
-205
lines changed

19 files changed

+300
-205
lines changed

rdagent/app/kaggle/loop.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
import subprocess
2-
from collections import defaultdict
3-
from concurrent.futures import TimeoutError
42
from typing import Any
53

64
import fire
@@ -14,7 +12,6 @@
1412
Hypothesis2Experiment,
1513
HypothesisExperiment2Feedback,
1614
HypothesisGen,
17-
Trace,
1815
)
1916
from rdagent.core.scenario import Scenario
2017
from rdagent.core.utils import import_class
@@ -116,7 +113,7 @@ def running(self, prev_out: dict[str, Any]):
116113

117114
return exp
118115

119-
skip_loop_error = (ModelEmptyError, FactorEmptyError, TimeoutError)
116+
skip_loop_error = (ModelEmptyError, FactorEmptyError)
120117

121118

122119
def main(path=None, step_n=None, competition=None):

rdagent/components/coder/model_coder/CoSTEER/evaluators.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,10 @@ def evaluate(
283283
else:
284284
gt_np_array = None
285285

286-
shape_feedback, shape_decision = shape_evaluator(gen_np_array, (batch_size, 1))
286+
shape_feedback, shape_decision = shape_evaluator(
287+
gen_np_array,
288+
(batch_size, self.scen.model_output_channel if hasattr(self.scen, "model_output_channel") else 1),
289+
)
287290
value_feedback, value_decision = value_evaluator(gen_np_array, gt_np_array)
288291
code_feedback, _ = ModelCodeEvaluator(scen=self.scen).evaluate(
289292
target_task=target_task,

rdagent/components/knowledge_management/graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ class UndirectedNode(Node):
2222
def __init__(self, content: str = "", label: str = "", embedding: Any = None) -> None:
2323
super().__init__(content, label, embedding)
2424
self.neighbors: set[UndirectedNode] = set()
25+
assert isinstance(content, str), "content must be a string"
2526

2627
def add_neighbor(self, node: UndirectedNode) -> None:
2728
self.neighbors.add(node)

rdagent/components/proposal/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def __init__(self, scen: Scenario):
7272
self.targets = "model tuning"
7373

7474

75-
class FactorAndModelHypothesisGen(FactorHypothesisGen):
75+
class FactorAndModelHypothesisGen(LLMHypothesisGen):
7676
def __init__(self, scen: Scenario):
7777
super().__init__(scen)
7878
self.targets = "feature engineering and model building"

rdagent/components/runner/__init__.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,5 @@ def get_cache_key(self, exp: Experiment) -> str:
2121
def assign_cached_result(self, exp: Experiment, cached_res: Experiment) -> Experiment:
2222
if exp.based_experiments and exp.based_experiments[-1].result is None:
2323
exp.based_experiments[-1].result = cached_res.based_experiments[-1].result
24-
if cached_res.experiment_workspace.workspace_path.exists():
25-
for csv_file in cached_res.experiment_workspace.workspace_path.glob("*.csv"):
26-
shutil.copy(csv_file, exp.experiment_workspace.workspace_path)
27-
for py_file in (cached_res.experiment_workspace.workspace_path / "feature").glob("*.py"):
28-
shutil.copy(py_file, exp.experiment_workspace.workspace_path / "feature")
29-
for py_file in (cached_res.experiment_workspace.workspace_path / "model").glob("*.py"):
30-
shutil.copy(py_file, exp.experiment_workspace.workspace_path / "model")
3124
exp.result = cached_res.result
3225
return exp

rdagent/core/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ def multiprocessing_wrapper(func_calls: list[tuple[Callable, tuple]], n: int) ->
142142
list
143143
144144
"""
145-
if n == 1:
145+
if n == 1 or max(1, min(n, len(func_calls))) == 1:
146146
return [f(*args) for f, args in func_calls]
147147

148148
with mp.Pool(processes=max(1, min(n, len(func_calls)))) as pool:

rdagent/log/ui/app.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,7 @@ def summary_window():
409409
if state.alpha158_metrics is not None:
410410
selected = ["alpha158"] + [i for i in df.index if state.h_decisions[int(i[6:])]]
411411
else:
412-
selected = [i for i in df.index if state.h_decisions[int(i[6:])]]
412+
selected = [i for i in df.index if i == "Baseline" or state.h_decisions[int(i[6:])]]
413413
df = df.loc[selected]
414414
if df.shape[0] == 1:
415415
st.table(df.iloc[0])
@@ -637,6 +637,7 @@ def evolving_window():
637637
for j, w in enumerate(ws):
638638
with wtabs[j]:
639639
# Evolving Code
640+
st.markdown(f"**Workspace Path**: {w.workspace_path}")
640641
for k, v in w.code_dict.items():
641642
with st.expander(f":green[`{k}`]", expanded=True):
642643
st.code(v, language="python")
@@ -681,6 +682,7 @@ def evolving_window():
681682
st.text_input("log path", key="log_path", on_change=refresh, label_visibility="collapsed")
682683
else:
683684
folders = [folder.relative_to(main_log_path) for folder in main_log_path.iterdir() if folder.is_dir()]
685+
folders = sorted(folders, key=lambda x: x.name)
684686
st.selectbox(f"**Select from `{main_log_path}`**", folders, key="log_path", on_change=refresh)
685687
else:
686688
st.text_input(":blue[**log path**]", key="log_path", on_change=refresh)

rdagent/scenarios/kaggle/developer/feedback.py

Lines changed: 57 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import pandas as pd
55
from jinja2 import Environment, StrictUndefined
66

7+
from rdagent.components.knowledge_management.graph import UndirectedNode
78
from rdagent.core.experiment import Experiment
89
from rdagent.core.prompts import Prompts
910
from rdagent.core.proposal import (
@@ -14,6 +15,7 @@
1415
)
1516
from rdagent.log import rdagent_logger as logger
1617
from rdagent.oai.llm_utils import APIBackend
18+
from rdagent.scenarios.kaggle.experiment.kaggle_experiment import KG_SELECT_MAPPING
1719
from rdagent.utils import convert2bool
1820

1921
prompt_dict = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")
@@ -59,17 +61,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
5961
Any: The feedback generated for the given experiment and hypothesis.
6062
"""
6163
logger.info("Generating feedback...")
62-
hypothesis_text = hypothesis.hypothesis
6364
current_result = exp.result
64-
tasks_factors = []
65-
if exp.sub_tasks:
66-
tasks_factors = []
67-
for task in exp.sub_tasks:
68-
try:
69-
task_info = task.get_task_information_and_implementation_result()
70-
tasks_factors.append(task_info)
71-
except AttributeError:
72-
print(f"Warning: Task {task} does not have get_task_information_and_implementation_result method")
7365

7466
evaluation_description = None
7567
# Check if there are any based experiments
@@ -84,11 +76,6 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
8476
) # Compare with itself
8577
print("Warning: No previous experiments to compare against. Using current result as baseline.")
8678

87-
available_features = {
88-
task_info: feature_shape for task_info, feature_shape in exp.experiment_workspace.data_description
89-
}
90-
model_code = exp.experiment_workspace.model_description
91-
9279
# Generate the user prompt based on the action type
9380
if hypothesis.action == "Model tuning":
9481
prompt_key = "model_tuning_feedback_generation"
@@ -104,35 +91,52 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
10491
.render(scenario=self.scen.get_scenario_all_desc(filtered_tag="feedback"))
10592
)
10693

107-
last_task_and_code = None
108-
if trace.hist:
109-
last_task_and_code = (
110-
trace.hist[-1][1].experiment_workspace.data_description
111-
if trace.hist[-1][0].action == "Feature engineering" or trace.hist[-1][0].action == "Feature processing"
112-
else trace.hist[-1][1].experiment_workspace.model_description
113-
)
94+
sota_exp = exp.based_experiments[-1] if exp.based_experiments else None
95+
assert sota_exp is not None
96+
sota_features = str(exp.based_experiments[-1].experiment_workspace.data_description)
97+
sota_models = json.dumps(exp.based_experiments[-1].experiment_workspace.model_description, indent=2)
98+
sota_result = exp.based_experiments[-1].result
99+
100+
current_hypothesis = hypothesis.hypothesis
101+
current_hypothesis_reason = hypothesis.reason
102+
current_target_action = hypothesis.action
103+
current_sub_exps_to_code = {}
104+
if hypothesis.action == "Model tuning":
105+
current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.sub_workspace_list[0].code
106+
elif hypothesis.action == "Model feature selection":
107+
current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.experiment_workspace.code_dict[
108+
KG_SELECT_MAPPING[exp.sub_tasks[0].model_type]
109+
]
110+
else:
111+
current_sub_exps_to_code = {
112+
sub_ws.target_task.get_task_information(): sub_ws.code for sub_ws in exp.sub_workspace_list
113+
}
114+
current_sub_exps_to_code_str = json.dumps(current_sub_exps_to_code, indent=2)
115+
current_result = exp.result
116+
117+
last_hypothesis_and_feedback = None
118+
if trace.hist and len(trace.hist) > 0:
119+
last_hypothesis_and_feedback = (trace.hist[-1][0], trace.hist[-1][2])
114120

115121
# Prepare render dictionary
116122
render_dict = {
117-
"last_hypothesis": trace.hist[-1][0] if trace.hist else None,
118-
"last_task_and_code": last_task_and_code,
119-
"last_result": trace.hist[-1][1].result if trace.hist else None,
120-
"sota_task_and_code": (
121-
exp.based_experiments[-1].experiment_workspace.data_description if exp.based_experiments else None
122-
),
123-
"sota_result": exp.based_experiments[-1].result if exp.based_experiments else None,
124-
"hypothesis": hypothesis,
125-
"exp": exp,
126-
"model_code": model_code, # This turn
127-
"available_features": available_features, # This turn
128-
"combined_result": combined_result, # This turn and sota
129-
"hypothesis_text": hypothesis_text, # This turn
130-
"task_details": tasks_factors, # This turn
123+
"sota_features": sota_features,
124+
"sota_models": sota_models,
125+
"sota_result": sota_result,
126+
"current_hypothesis": current_hypothesis,
127+
"current_hypothesis_reason": current_hypothesis_reason,
128+
"current_target_action": current_target_action,
129+
"current_sub_exps_to_code": current_sub_exps_to_code_str,
130+
"current_result": current_result,
131+
"combined_result": combined_result,
131132
"evaluation_description": evaluation_description,
133+
"last_hypothesis_and_feedback": last_hypothesis_and_feedback,
132134
}
133135

134136
usr_prompt = (
135-
Environment(undefined=StrictUndefined).from_string(prompt_dict[prompt_key]["user"]).render(**render_dict)
137+
Environment(undefined=StrictUndefined)
138+
.from_string(prompt_dict["kg_feedback_generation_user"])
139+
.render(**render_dict)
136140
)
137141

138142
response = APIBackend().build_messages_and_create_chat_completion(
@@ -160,22 +164,29 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
160164
percentile_ranking = (insert_position) / (len(sorted_scores)) * 100
161165

162166
experiment_feedback = {
163-
"current_competition": self.scen.get_competition_full_desc(),
164-
"hypothesis_text": hypothesis_text,
167+
"hypothesis_text": current_hypothesis,
168+
"tasks_factors": current_sub_exps_to_code,
165169
"current_result": current_result,
166-
"model_code": model_code,
167-
"available_features": available_features,
168-
"observations": observations,
169-
"hypothesis_evaluation": hypothesis_evaluation,
170-
"reason": reason,
171-
"percentile_ranking": percentile_ranking,
172170
}
173171

174172
if self.scen.if_using_vector_rag:
173+
raise NotImplementedError("Vector RAG is not implemented yet since there are plenty bugs!")
175174
self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
176175
self.scen.vector_base.dump()
177176
elif self.scen.if_using_graph_rag:
178-
trace.knowledge_base.add_document(experiment_feedback, self.scen)
177+
competition_node = UndirectedNode(content=self.scen.get_competition_full_desc(), label="competition")
178+
hypothesis_node = UndirectedNode(content=hypothesis.hypothesis, label=hypothesis.action)
179+
exp_code_nodes = []
180+
for exp, code in current_sub_exps_to_code.items():
181+
exp_code_nodes.append(UndirectedNode(content=exp, label="experiments"))
182+
if code != "":
183+
exp_code_nodes.append(UndirectedNode(content=code, label="code"))
184+
conclusion_node = UndirectedNode(content=response, label="conclusion")
185+
all_nodes = [competition_node, hypothesis_node, *exp_code_nodes, conclusion_node]
186+
all_nodes = trace.knowledge_base.batch_embedding(all_nodes)
187+
for node in all_nodes:
188+
if node is not competition_node:
189+
trace.knowledge_base.add_node(node, competition_node)
179190

180191
if self.scen.if_action_choosing_based_on_UCB:
181192
self.scen.action_counts[hypothesis.action] += 1

rdagent/scenarios/kaggle/developer/runner.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from rdagent.components.runner import CachedRunner
77
from rdagent.core.exception import CoderError, FactorEmptyError, ModelEmptyError
8-
from rdagent.core.experiment import ASpecificExp
8+
from rdagent.core.experiment import ASpecificExp, Experiment
99
from rdagent.core.prompts import Prompts
1010
from rdagent.core.utils import cache_with_pickle
1111
from rdagent.oai.llm_utils import md5_hash
@@ -28,6 +28,18 @@ def get_cache_key(self, exp: ASpecificExp) -> str:
2828
cached_key_from_exp = CachedRunner.get_cache_key(self, exp)
2929
return md5_hash(codes + cached_key_from_exp)
3030

31+
def assign_cached_result(self, exp: Experiment, cached_res: Experiment) -> Experiment:
32+
exp = CachedRunner.assign_cached_result(self, exp, cached_res)
33+
if cached_res.experiment_workspace.workspace_path.exists():
34+
for csv_file in cached_res.experiment_workspace.workspace_path.glob("*.csv"):
35+
shutil.copy(csv_file, exp.experiment_workspace.workspace_path)
36+
for py_file in (cached_res.experiment_workspace.workspace_path / "feature").glob("*.py"):
37+
shutil.copy(py_file, exp.experiment_workspace.workspace_path / "feature")
38+
for py_file in (cached_res.experiment_workspace.workspace_path / "model").glob("*.py"):
39+
shutil.copy(py_file, exp.experiment_workspace.workspace_path / "model")
40+
exp.experiment_workspace.data_description = cached_res.experiment_workspace.data_description
41+
return exp
42+
3143
@cache_with_pickle(get_cache_key, CachedRunner.assign_cached_result)
3244
def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorExperiment | KGModelExperiment:
3345
"""
@@ -79,10 +91,13 @@ def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
7991
for sub_ws in exp.sub_workspace_list:
8092
if sub_ws.code_dict == {}:
8193
continue
94+
execued_df = sub_ws.execute()[1]
95+
if execued_df is None:
96+
continue
8297
implemented_factor_count += 1
8398
target_feature_file_name = f"feature/feature_{current_feature_file_count:05d}.py"
8499
exp.experiment_workspace.inject_code(**{target_feature_file_name: sub_ws.code_dict["factor.py"]})
85-
feature_shape = sub_ws.execute()[1].shape[-1]
100+
feature_shape = execued_df.shape[-1]
86101
exp.experiment_workspace.data_description.append((sub_ws.target_task.get_task_information(), feature_shape))
87102
current_feature_file_count += 1
88103
if implemented_factor_count == 0:

rdagent/scenarios/kaggle/docker/Dockerfile renamed to rdagent/scenarios/kaggle/docker/kaggle_docker/Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,14 @@ RUN python -m pip install numpy
1717
RUN python -m pip install pandas
1818
# RUN pip install pyg_lib torch_scatter torch_sparse torch_cluster -f https://data.pyg.org/whl/torch-2.3.0%2Bcu121.html
1919
RUN pip install torch_geometric
20+
RUN pip install pytorch_lightning
2021
RUN pip install ogb
2122
RUN pip install networkx
2223
RUN pip install scikit-learn
2324
RUN pip install catboost
2425
RUN pip install xgboost
2526
RUN pip install sparse
26-
RUN pip install lightgbm
27+
RUN pip install lightgbm==3.3.5
2728
RUN pip install pyarrow
2829
RUN pip install fastparquet
2930
RUN pip install optuna

0 commit comments

Comments
 (0)