Skip to content

Commit 4cf22a6

Browse files
peteryang1, you-n-g, taozhiwang, Xisen-Wang, Hytn
authored
feat: Kaggle loop update (Feature & Model) (microsoft#241)
* Init todo * Evaluation & dataset * Generate new data * dataset generation * add the result * Analysis * Factor update * Updates * Reformat analysis.py * CI fix * Revised Preprocessing & Supported Random Forest * Revised to support three models with feature * Further revised prompts * Slight Revision * docs: update contributors (microsoft#230) * Revised to support three models with feature * Further revised prompts * Slight Revision * feat: kaggle model and feature (microsoft#238) * update first version code * make hypothesis_gen and experiment_builder fit for both feature and model * feat: continue kaggle feature and model coder (microsoft#239) * use qlib docker to run qlib models * feature coder ready * model coder ready * fix CI * finish the first round of runner (microsoft#240) * Optimized the factor scenario and added the front-end. * fix a small bug * fix a typo * update the kaggle scenario * delete model_template folder * use experiment to run data preprocess script * add source data to scenarios * minor fix * minor bug fix * train.py debug * fixed a bug in train.py and added some TODOs * For Debugging * fix two small bugs in based_exp * fix some bugs * update preprocess * fix a bug in preprocess * fix a bug in train.py * reformat * Follow-up * fix a bug in train.py * fix a bug in workspace * fix a bug in feature duplication * fix a bug in feedback * fix a bug in preprocessed data * fix a bug om feature engineering * fix a ci error * Debugged & Connected * Fixed error on feedback & added other fixes * fix CI errors * fix a CI bug * fix: fix_dotenv_error (microsoft#257) * fix_dotenv_error * format with isort * Update rdagent/app/cli.py --------- Co-authored-by: you-n-g <you-n-g@users.noreply.github.com> * chore(main): release 0.2.1 (microsoft#249) Release-As: 0.2.1 * init a scenario for kaggle feature engineering * delete error codes * Delete rdagent/app/kaggle_feature/conf.py --------- Co-authored-by: Young <afe.young@gmail.com> Co-authored-by: Taozhi Wang 
<taozhi.mark.wang@gmail.com> Co-authored-by: you-n-g <you-n-g@users.noreply.github.com> Co-authored-by: cyncyw <47289405+taozhiwang@users.noreply.github.com> Co-authored-by: Xisen-Wang <xisen_application@163.com> Co-authored-by: Haotian Chen <113661982+Hytn@users.noreply.github.com> Co-authored-by: WinstonLiye <1957922024@qq.com> Co-authored-by: WinstonLiyt <104308117+WinstonLiyt@users.noreply.github.com> Co-authored-by: Linlang <30293408+SunsetWolf@users.noreply.github.com>
1 parent 44031d5 commit 4cf22a6

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+1518
-686
lines changed

rdagent/app/kaggle/conf.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,29 +13,33 @@ class Config:
1313
"""Add 'model_' to the protected namespaces"""
1414

1515
# 1) overriding the default
16-
scen: str = "rdagent.scenarios.kaggle.experiment.model_experiment.KGModelScenario"
16+
scen: str = "rdagent.scenarios.kaggle.experiment.scenario.KGScenario"
1717
"""Scenario class for data mining model"""
1818

19-
hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.model_proposal.KGModelHypothesisGen"
19+
hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesisGen"
2020
"""Hypothesis generation class"""
2121

22-
hypothesis2experiment: str = "rdagent.scenarios.kaggle.proposal.model_proposal.KGModelHypothesis2Experiment"
22+
hypothesis2experiment: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesis2Experiment"
2323
"""Hypothesis to experiment class"""
2424

25-
coder: str = "rdagent.scenarios.kaggle.developer.model_coder.KGModelCoSTEER"
26-
"""Coder class"""
25+
feature_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGFactorCoSTEER"
26+
"""Feature Coder class"""
2727

28-
runner: str = "rdagent.scenarios.kaggle.developer.model_runner.KGModelRunner"
29-
"""Runner class"""
28+
model_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGModelCoSTEER"
29+
"""Model Coder class"""
3030

31-
summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGModelHypothesisExperiment2Feedback"
31+
feature_runner: str = "rdagent.scenarios.kaggle.developer.runner.KGFactorRunner"
32+
"""Feature Runner class"""
33+
34+
model_runner: str = "rdagent.scenarios.kaggle.developer.runner.KGModelRunner"
35+
"""Model Runner class"""
36+
37+
summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGHypothesisExperiment2Feedback"
3238
"""Summarizer class"""
3339

3440
evolving_n: int = 10
3541
"""Number of evolutions"""
3642

37-
evolving_n: int = 10
38-
3943
competition: str = ""
4044

4145

rdagent/app/kaggle/loop.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
from collections import defaultdict
2+
from typing import Any
3+
4+
import fire
5+
6+
from rdagent.app.kaggle.conf import PROP_SETTING
7+
from rdagent.components.workflow.conf import BasePropSetting
8+
from rdagent.components.workflow.rd_loop import RDLoop
9+
from rdagent.core.developer import Developer
10+
from rdagent.core.exception import ModelEmptyError
11+
from rdagent.core.proposal import (
12+
Hypothesis2Experiment,
13+
HypothesisExperiment2Feedback,
14+
HypothesisGen,
15+
Trace,
16+
)
17+
from rdagent.core.scenario import Scenario
18+
from rdagent.core.utils import import_class
19+
from rdagent.log import rdagent_logger as logger
20+
from rdagent.scenarios.kaggle.proposal.proposal import (
21+
KG_ACTION_FEATURE_ENGINEERING,
22+
KG_ACTION_FEATURE_PROCESSING,
23+
)
24+
25+
26+
class ModelRDLoop(RDLoop):
27+
def __init__(self, PROP_SETTING: BasePropSetting):
28+
with logger.tag("init"):
29+
scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition)
30+
logger.log_object(scen, tag="scenario")
31+
32+
self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen)
33+
logger.log_object(self.hypothesis_gen, tag="hypothesis generator")
34+
35+
self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.hypothesis2experiment)()
36+
logger.log_object(self.hypothesis2experiment, tag="hypothesis2experiment")
37+
38+
self.feature_coder: Developer = import_class(PROP_SETTING.feature_coder)(scen)
39+
logger.log_object(self.feature_coder, tag="feature coder")
40+
self.model_coder: Developer = import_class(PROP_SETTING.model_coder)(scen)
41+
logger.log_object(self.model_coder, tag="model coder")
42+
43+
self.feature_runner: Developer = import_class(PROP_SETTING.feature_runner)(scen)
44+
logger.log_object(self.feature_runner, tag="feature runner")
45+
self.model_runner: Developer = import_class(PROP_SETTING.model_runner)(scen)
46+
logger.log_object(self.model_runner, tag="model runner")
47+
48+
self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen)
49+
logger.log_object(self.summarizer, tag="summarizer")
50+
self.trace = Trace(scen=scen)
51+
super(RDLoop, self).__init__()
52+
53+
def coding(self, prev_out: dict[str, Any]):
54+
with logger.tag("d"): # develop
55+
if prev_out["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]:
56+
exp = self.feature_coder.develop(prev_out["exp_gen"])
57+
else:
58+
exp = self.model_coder.develop(prev_out["exp_gen"])
59+
logger.log_object(exp.sub_workspace_list, tag="coder result")
60+
return exp
61+
62+
def running(self, prev_out: dict[str, Any]):
63+
with logger.tag("ef"): # evaluate and feedback
64+
if prev_out["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]:
65+
exp = self.feature_runner.develop(prev_out["coding"])
66+
else:
67+
exp = self.model_runner.develop(prev_out["coding"])
68+
logger.log_object(exp, tag="runner result")
69+
return exp
70+
71+
skip_loop_error = (ModelEmptyError,)
72+
73+
74+
def main(path=None, step_n=None, competition=None):
75+
"""
76+
Auto R&D Evolving loop for models in a kaggle{} scenario.
77+
78+
You can continue running session by
79+
80+
.. code-block:: python
81+
82+
dotenv run -- python rdagent/app/kaggle/loop.py [--competition titanic] $LOG_PATH/__session__/1/0_propose --step_n 1 # `step_n` is a optional paramter
83+
84+
"""
85+
if competition:
86+
PROP_SETTING.competition = competition
87+
if path is None:
88+
model_loop = ModelRDLoop(PROP_SETTING)
89+
else:
90+
model_loop = ModelRDLoop.load(path)
91+
model_loop.run(step_n=step_n)
92+
93+
94+
if __name__ == "__main__":
95+
from dotenv import load_dotenv
96+
97+
load_dotenv(override=True)
98+
fire.Fire(main)

rdagent/components/coder/factor_coder/CoSTEER/evaluators.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ def evaluate(
161161
)
162162
buffer = io.StringIO()
163163
gen_df.info(buf=buffer)
164-
gen_df_info_str = buffer.getvalue()
164+
gen_df_info_str = f"The use is currently working on a feature related task.\nThe output dataframe info is:\n{buffer.getvalue()}"
165165
system_prompt = (
166166
Environment(undefined=StrictUndefined)
167167
.from_string(
@@ -378,6 +378,7 @@ def evaluate(
378378
self,
379379
implementation: Workspace,
380380
gt_implementation: Workspace,
381+
version: int = 1, # 1 for qlib factors and 2 for kaggle factors
381382
**kwargs,
382383
) -> Tuple:
383384
conclusions = []
@@ -389,18 +390,21 @@ def evaluate(
389390
equal_value_ratio_result = 0
390391
high_correlation_result = False
391392

392-
# Check if both dataframe has only one columns
393-
feedback_str, _ = FactorSingleColumnEvaluator(self.scen).evaluate(implementation, gt_implementation)
394-
conclusions.append(feedback_str)
393+
# Check if both dataframe has only one columns Mute this since factor task might generate more than one columns now
394+
if version == 1:
395+
feedback_str, _ = FactorSingleColumnEvaluator(self.scen).evaluate(implementation, gt_implementation)
396+
conclusions.append(feedback_str)
395397

396398
# Check if the index of the dataframe is ("datetime", "instrument")
397399
feedback_str, _ = FactorOutputFormatEvaluator(self.scen).evaluate(implementation, gt_implementation)
398400
conclusions.append(feedback_str)
399-
400-
feedback_str, daily_check_result = FactorDatetimeDailyEvaluator(self.scen).evaluate(
401-
implementation, gt_implementation
402-
)
403-
conclusions.append(feedback_str)
401+
if version == 1:
402+
feedback_str, daily_check_result = FactorDatetimeDailyEvaluator(self.scen).evaluate(
403+
implementation, gt_implementation
404+
)
405+
conclusions.append(feedback_str)
406+
else:
407+
daily_check_result = None
404408

405409
# Check if both dataframe have the same rows count
406410
if gt_implementation is not None:
@@ -627,7 +631,9 @@ def evaluate(
627631
(
628632
factor_feedback.factor_value_feedback,
629633
decision_from_value_check,
630-
) = self.value_evaluator.evaluate(implementation=implementation, gt_implementation=gt_implementation)
634+
) = self.value_evaluator.evaluate(
635+
implementation=implementation, gt_implementation=gt_implementation, version=target_task.version
636+
)
631637

632638
factor_feedback.final_decision_based_on_gt = gt_implementation is not None
633639

@@ -647,7 +653,7 @@ def evaluate(
647653
target_task=target_task,
648654
implementation=implementation,
649655
execution_feedback=factor_feedback.execution_feedback,
650-
value_feedback=factor_feedback.factor_value_feedback,
656+
factor_value_feedback=factor_feedback.factor_value_feedback,
651657
gt_implementation=gt_implementation,
652658
)
653659
(

rdagent/components/coder/factor_coder/factor.py

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,16 +24,19 @@ def __init__(
2424
factor_name,
2525
factor_description,
2626
factor_formulation,
27+
*args,
2728
variables: dict = {},
2829
resource: str = None,
2930
factor_implementation: bool = False,
31+
**kwargs,
3032
) -> None:
3133
self.factor_name = factor_name
3234
self.factor_description = factor_description
3335
self.factor_formulation = factor_formulation
3436
self.variables = variables
3537
self.factor_resources = resource
3638
self.factor_implementation = factor_implementation
39+
super().__init__(*args, **kwargs)
3740

3841
def get_task_information(self):
3942
return f"""factor_name: {self.factor_name}
@@ -75,8 +78,8 @@ class FactorFBWorkspace(FBWorkspace):
7578
def __init__(
7679
self,
7780
*args,
78-
executed_factor_value_dataframe=None,
79-
raise_exception=False,
81+
executed_factor_value_dataframe: pd.DataFrame = None,
82+
raise_exception: bool = False,
8083
**kwargs,
8184
) -> None:
8285
super().__init__(*args, **kwargs)
@@ -102,7 +105,10 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
102105
1. make the directory in workspace path
103106
2. write the code to the file in the workspace path
104107
3. link all the source data to the workspace path folder
105-
4. execute the code
108+
if call_factor_py is True:
109+
4. execute the code
110+
else:
111+
4. generate a script from template to import the factor.py dump get the factor value to result.h5
106112
5. read the factor value from the output file in the workspace path folder
107113
returns the execution feedback as a string and the factor value as a pandas dataframe
108114
@@ -130,15 +136,21 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
130136
if self.executed_factor_value_dataframe is not None:
131137
return self.FB_FROM_CACHE, self.executed_factor_value_dataframe
132138

133-
source_data_path = (
134-
Path(
135-
FACTOR_IMPLEMENT_SETTINGS.data_folder_debug,
139+
if self.target_task.version == 1:
140+
source_data_path = (
141+
Path(
142+
FACTOR_IMPLEMENT_SETTINGS.data_folder_debug,
143+
)
144+
if data_type == "Debug"
145+
else Path(
146+
FACTOR_IMPLEMENT_SETTINGS.data_folder,
147+
)
136148
)
137-
if data_type == "Debug"
138-
else Path(
149+
elif self.target_task.version == 2:
150+
# TODO you can change the name of the data folder for a better understanding
151+
source_data_path = Path(
139152
FACTOR_IMPLEMENT_SETTINGS.data_folder,
140153
)
141-
)
142154

143155
source_data_path.mkdir(exist_ok=True, parents=True)
144156
code_path = self.workspace_path / f"factor.py"
@@ -147,9 +159,16 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
147159

148160
execution_feedback = self.FB_EXECUTION_SUCCEEDED
149161
execution_success = False
162+
163+
if self.target_task.version == 1:
164+
execution_code_path = code_path
165+
elif self.target_task.version == 2:
166+
execution_code_path = self.workspace_path / f"{uuid.uuid4()}.py"
167+
execution_code_path.write_text((Path(__file__).parent / "factor_execution_template.txt").read_text())
168+
150169
try:
151170
subprocess.check_output(
152-
f"{FACTOR_IMPLEMENT_SETTINGS.python_bin} {code_path}",
171+
f"{FACTOR_IMPLEMENT_SETTINGS.python_bin} {execution_code_path}",
153172
shell=True,
154173
cwd=self.workspace_path,
155174
stderr=subprocess.STDOUT,
@@ -161,7 +180,7 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
161180

162181
execution_feedback = (
163182
e.output.decode()
164-
.replace(str(code_path.parent.absolute()), r"/path/to")
183+
.replace(str(execution_code_path.parent.absolute()), r"/path/to")
165184
.replace(str(site.getsitepackages()[0]), r"/path/to/site-packages")
166185
)
167186
if len(execution_feedback) > 2000:
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import os
2+
3+
import numpy as np
4+
import pandas as pd
5+
from factor import feat_eng
6+
7+
if os.path.exists("valid.pkl"):
8+
valid_df = pd.read_pickle("valid.pkl")
9+
else:
10+
raise FileNotFoundError("No valid data found.")
11+
12+
new_feat = feat_eng(valid_df)
13+
new_feat.to_hdf("result.h5", key="data", mode="w")

rdagent/components/coder/model_coder/CoSTEER/evaluators.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
evaluate_prompts = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")
2525

2626

27-
def shape_evaluator(prediction: torch.Tensor, target_shape: Tuple = None) -> Tuple[str, bool]:
27+
def shape_evaluator(prediction: torch.Tensor | np.ndarray, target_shape: Tuple = None) -> Tuple[str, bool]:
2828
if target_shape is None or prediction is None:
2929
return (
3030
"No output generated from the model. No shape evaluation conducted.",
@@ -279,12 +279,8 @@ def evaluate(
279279
else:
280280
gt_tensor = None
281281

282-
if target_task.model_type == "XGBoost":
283-
shape_feedback = "Not applicable for XGBoost models"
284-
shape_decision = True
285-
else:
286-
shape_feedback, shape_decision = shape_evaluator(gen_tensor, (batch_size, 1))
287-
value_feedback, value_decision = value_evaluator(gt_tensor, gen_tensor)
282+
shape_feedback, shape_decision = shape_evaluator(gen_tensor, (batch_size, 1))
283+
value_feedback, value_decision = value_evaluator(gen_tensor, gt_tensor)
288284
code_feedback, _ = ModelCodeEvaluator(scen=self.scen).evaluate(
289285
target_task=target_task,
290286
implementation=implementation,

0 commit comments

Comments
 (0)