Skip to content

Commit 3c6bcab

Browse files
authored
fix: fix some errors in scenario.py, proposal.py, and runner.py, and several complex competition scenarios (microsoft#365)
* fix several bugs in proposal and runner
* fix a bug in feedback-prize-english-language-learning
* fix some bugs and templates
* fix the bug in optiver and the NLP problem
1 parent 312029d commit 3c6bcab

File tree

14 files changed

+155
-171
lines changed

14 files changed

+155
-171
lines changed

rdagent/log/ui/app.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -443,9 +443,10 @@ def tasks_window(tasks: list[FactorTask | ModelTask]):
443443
st.latex(ft.factor_formulation)
444444

445445
mks = "| Variable | Description |\n| --- | --- |\n"
446-
for v, d in ft.variables.items():
447-
mks += f"| ${v}$ | {d} |\n"
448-
st.markdown(mks)
446+
if isinstance(ft.variables, dict):
447+
for v, d in ft.variables.items():
448+
mks += f"| ${v}$ | {d} |\n"
449+
st.markdown(mks)
449450

450451
elif isinstance(tasks[0], ModelTask):
451452
st.markdown("**Model Tasks🚩**")

rdagent/scenarios/kaggle/developer/feedback.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
163163
self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
164164
self.scen.vector_base.save()
165165
elif self.scen.if_using_graph_rag:
166-
self.scen.trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)
166+
trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)
167167

168168
return HypothesisFeedback(
169169
observations=observations,

rdagent/scenarios/kaggle/developer/runner.py

Lines changed: 66 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,48 @@ def get_cache_key(self, exp: ASpecificExp) -> str:
3232
codes = "\n".join(codes)
3333
return md5_hash(codes)
3434

35+
def extract_model_task_from_code(self, code: str) -> str:
36+
sys_prompt = (
37+
Environment(undefined=StrictUndefined)
38+
.from_string(prompt_dict["extract_model_task_from_code"]["system"])
39+
.render()
40+
)
41+
42+
user_prompt = (
43+
Environment(undefined=StrictUndefined)
44+
.from_string(prompt_dict["extract_model_task_from_code"]["user"])
45+
.render(file_content=code)
46+
)
47+
48+
model_task_description = APIBackend().build_messages_and_create_chat_completion(
49+
user_prompt=user_prompt,
50+
system_prompt=sys_prompt,
51+
json_mode=True,
52+
)
53+
54+
try:
55+
response_json_analysis = json.loads(model_task_description)
56+
task_desc = f"""name: {response_json_analysis['name']}
57+
description: {response_json_analysis['description']}
58+
"""
59+
task_desc += (
60+
f"formulation: {response_json_analysis['formulation']}\n"
61+
if response_json_analysis.get("formulation")
62+
else ""
63+
)
64+
task_desc += f"architecture: {response_json_analysis['architecture']}\n"
65+
task_desc += (
66+
f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
67+
if response_json_analysis.get("variables")
68+
else ""
69+
)
70+
task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
71+
task_desc += f"model_type: {response_json_analysis['model_type']}\n"
72+
except json.JSONDecodeError:
73+
task_desc = "Failed to parse LLM's response as JSON"
74+
75+
return task_desc
76+
3577
def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorExperiment | KGModelExperiment:
3678
"""
3779
For the initial development, the experiment serves as a benchmark for feature engineering.
@@ -59,21 +101,27 @@ def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorE
59101
feature_shape = org_data.shape[-1]
60102
exp.experiment_workspace.data_description.append((sub_task.get_task_information(), feature_shape))
61103

62-
sub_model_1_description = (
63-
self.extract_model_task_from_code(
64-
(exp.experiment_workspace.workspace_path / "model" / "model_randomforest.py").read_text()
65-
)
66-
+ f"""code: { (exp.experiment_workspace.workspace_path / "model" / "model_randomforest.py").read_text()}"""
67-
)
68-
sub_model_2_description = (
69-
self.extract_model_task_from_code(
70-
(exp.experiment_workspace.workspace_path / "model" / "model_xgboost.py").read_text()
71-
)
72-
+ f"""code: { (exp.experiment_workspace.workspace_path / "model" / "model_xgboost.py").read_text()}"""
73-
)
104+
model_map = {
105+
"XGBoost": "model_xgboost.py",
106+
"RandomForest": "model_randomforest.py",
107+
"LightGBM": "model_lightgbm.py",
108+
"NN": "model_nn.py",
109+
}
110+
111+
workspace_path = exp.experiment_workspace.workspace_path / "model"
112+
113+
for model_name, model_file in model_map.items():
114+
model_file_path = workspace_path / model_file
74115

75-
exp.experiment_workspace.model_description["XGBoost"] = sub_model_1_description
76-
exp.experiment_workspace.model_description["RandomForest"] = sub_model_2_description
116+
if model_file_path.exists():
117+
model_description = (
118+
self.extract_model_task_from_code(model_file_path.read_text())
119+
+ f"""code: {model_file_path.read_text()}"""
120+
)
121+
else:
122+
model_description = ""
123+
124+
exp.experiment_workspace.model_description[model_name] = model_description
77125

78126
if RUNNER_SETTINGS.cache_result:
79127
self.dump_cache_result(exp, result)
@@ -120,51 +168,7 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
120168

121169

122170
class KGFactorRunner(KGCachedRunner[KGFactorExperiment]):
123-
def extract_model_task_from_code(self, code: str) -> str:
124-
sys_prompt = (
125-
Environment(undefined=StrictUndefined)
126-
.from_string(prompt_dict["extract_model_task_from_code"]["system"])
127-
.render()
128-
)
129-
130-
user_prompt = (
131-
Environment(undefined=StrictUndefined)
132-
.from_string(prompt_dict["extract_model_task_from_code"]["user"])
133-
.render(file_content=code)
134-
)
135-
136-
model_task_description = APIBackend().build_messages_and_create_chat_completion(
137-
user_prompt=user_prompt,
138-
system_prompt=sys_prompt,
139-
json_mode=True,
140-
)
141-
142-
try:
143-
response_json_analysis = json.loads(model_task_description)
144-
task_desc = f"""name: {response_json_analysis['name']}
145-
description: {response_json_analysis['description']}
146-
"""
147-
task_desc += (
148-
f"formulation: {response_json_analysis['formulation']}\n"
149-
if response_json_analysis.get("formulation")
150-
else ""
151-
)
152-
task_desc += f"architecture: {response_json_analysis['architecture']}\n"
153-
task_desc += (
154-
f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
155-
if response_json_analysis.get("variables")
156-
else ""
157-
)
158-
task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
159-
task_desc += f"model_type: {response_json_analysis['model_type']}\n"
160-
except json.JSONDecodeError:
161-
task_desc = "Failed to parse LLM's response as JSON"
162-
163-
return task_desc
164-
165171
def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
166-
if exp.based_experiments and exp.based_experiments[-1].result is None:
167-
exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])
168172
current_feature_file_count = len(list(exp.experiment_workspace.workspace_path.glob("feature/feature*.py")))
169173
implemented_factor_count = 0
170174
for sub_ws in exp.sub_workspace_list:
@@ -179,6 +183,10 @@ def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
179183
if implemented_factor_count == 0:
180184
raise FactorEmptyError("No factor is implemented")
181185

186+
# initial template result
187+
if exp.based_experiments and exp.based_experiments[-1].result is None:
188+
exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])
189+
182190
if RUNNER_SETTINGS.cache_result:
183191
cache_hit, result = self.get_cache_result(exp)
184192
if cache_hit:

rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,8 @@ def data_cleaner(text):
3636

3737
y_train = train[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]]
3838

39-
vectorizer = TfidfVectorizer()
40-
X_train = vectorizer.fit_transform(train["full_text"])
41-
X_test = vectorizer.transform(test["full_text"])
42-
43-
X_train = pd.DataFrame.sparse.from_spmatrix(X_train)
44-
X_test = pd.DataFrame.sparse.from_spmatrix(X_test)
39+
X_train = train[["full_text"]]
40+
X_test = test[["full_text"]]
4541

4642
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
4743

rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import pandas as pd
2+
from sklearn.feature_extraction.text import TfidfVectorizer
23

34
"""
45
Here is the feature engineering code for each task, with a class that has a fit and transform method.
@@ -11,12 +12,15 @@ def fit(self, train_df: pd.DataFrame):
1112
"""
1213
Fit the feature engineering model to the training data.
1314
"""
14-
pass
15+
self.vectorizer = TfidfVectorizer()
16+
self.vectorizer.fit(train_df["full_text"])
1517

1618
def transform(self, X: pd.DataFrame):
1719
"""
1820
Transform the input data.
1921
"""
22+
X = self.vectorizer.transform(X["full_text"])
23+
X = pd.DataFrame.sparse.from_spmatrix(X)
2024
return X
2125

2226

rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_xgboost.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
2222
X_train = select(X_train)
2323

2424
xgb_estimator = xgb.XGBRegressor(
25-
n_estimators=500, random_state=0, objective="reg:squarederror", tree_method="gpu_hist", device="cuda"
25+
n_estimators=500, random_state=0, objective="reg:squarederror", tree_method="hist", device="cuda"
2626
)
2727

2828
model = MultiOutputRegressor(xgb_estimator, n_jobs=-1)

rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py

Lines changed: 12 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ def import_module_from_path(module_name, module_path):
1515
return module
1616

1717

18+
def MCRMSE(y_true, y_pred):
19+
return np.mean(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0)))
20+
21+
1822
# 1) Preprocess the data
1923
X_train, X_valid, y_train, y_valid, X_test = preprocess_script()
2024

@@ -24,6 +28,7 @@ def import_module_from_path(module_name, module_path):
2428

2529
for f in DIRNAME.glob("feature/feat*.py"):
2630
cls = import_module_from_path(f.stem, f).feature_engineering_cls()
31+
print(X_train.head())
2732
cls.fit(X_train)
2833
X_train_f = cls.transform(X_train)
2934
X_valid_f = cls.transform(X_valid)
@@ -62,33 +67,18 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
6267

6368
# 4) Evaluate the model on the validation set
6469
y_valid_pred_l = []
70+
metrics_all = []
6571
for model, predict_func in model_l:
6672
y_valid_pred = predict_func(model, X_valid)
6773
y_valid_pred_l.append(y_valid_pred)
68-
# print(y_valid_pred)
69-
# print(y_valid_pred.shape)
70-
71-
# 5) Ensemble
72-
# Majority vote ensemble
73-
y_valid_pred_ensemble = np.mean(y_valid_pred_l, axis=0)
74-
74+
metrics = MCRMSE(y_valid, y_valid_pred)
75+
print(f"MCRMSE on valid set: {metrics}")
76+
metrics_all.append(metrics)
7577

76-
# 6) Save the validation metrics
77-
def MCRMSE(y_true, y_pred):
78-
return np.mean(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0)))
79-
80-
81-
metrics = MCRMSE(y_valid, y_valid_pred_ensemble)
82-
print(f"MCRMSE on valid set: {metrics}")
83-
pd.Series(data=[metrics], index=["MCRMSE"]).to_csv("submission_score.csv")
84-
85-
# 7) Make predictions on the test set and save them
86-
y_test_pred_l = []
87-
for model, predict_func in model_l:
88-
y_test_pred_l.append(predict_func(model, X_test))
78+
min_index = np.argmin(metrics_all)
79+
pd.Series(data=[metrics_all[min_index]], index=["MCRMSE"]).to_csv("submission_score.csv")
8980

90-
# For multiclass classification, use the mode of the predictions
91-
y_test_pred = np.mean(y_test_pred_l, axis=0)
81+
y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test)
9282

9383

9484
submission_result = pd.read_csv("/kaggle/input/sample_submission.csv")

rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,20 @@ def import_module_from_path(module_name, module_path):
9393
X_te = X_te.loc[:, ~X_te.columns.duplicated()]
9494

9595
# Train the model
96+
def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
97+
"""
98+
Flatten the columns of a DataFrame with MultiIndex columns,
99+
for (feature_0, a), (feature_0, b) -> feature_0_a, feature_0_b
100+
"""
101+
if df.columns.nlevels == 1:
102+
return df
103+
df.columns = ["_".join(col).strip() for col in df.columns.values]
104+
return df
105+
106+
X_tr = flatten_columns(X_tr)
107+
X_val = flatten_columns(X_val)
108+
X_te = flatten_columns(X_te)
109+
96110
model_l = [] # list[tuple[model, predict_func]]
97111
for f in DIRNAME.glob("model/model*.py"):
98112
m = import_module_from_path(f.stem, f)

rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/fea_share_preprocess.py

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,24 +11,26 @@
1111

1212
def prepreprocess():
1313
# Load the training data
14-
train_df = pd.read_csv("/kaggle/input/train.csv").head(1000)
14+
train_df = pd.read_csv("/kaggle/input/train.csv")
1515

1616
# Load book and trade data
17-
book_train = pd.read_parquet("/kaggle/input/book_train.parquet").head(1000)
18-
trade_train = pd.read_parquet("/kaggle/input/trade_train.parquet").head(1000)
17+
book_train = pd.read_parquet("/kaggle/input/book_train.parquet")
18+
trade_train = pd.read_parquet("/kaggle/input/trade_train.parquet")
1919

2020
# Merge book and trade data with train_df
2121
merged_df = pd.merge(train_df, book_train, on=["stock_id", "time_id"], how="left")
2222
merged_df = pd.merge(merged_df, trade_train, on=["stock_id", "time_id"], how="left")
2323

24-
print(merged_df.head())
25-
2624
# Split the data
2725
X = merged_df.drop(["target"], axis=1)
2826
y = merged_df["target"]
2927

28+
print(X.columns.to_list())
29+
3030
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
3131

32+
print(X_train.columns.to_list())
33+
3234
return X_train, X_valid, y_train, y_valid
3335

3436

@@ -60,7 +62,6 @@ def preprocess_fit(X_train: pd.DataFrame):
6062
def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols, categorical_cols):
6163
X_transformed = preprocessor.transform(X)
6264

63-
# Convert arrays back to DataFrames
6465
X_transformed = pd.DataFrame(X_transformed, columns=numerical_cols + categorical_cols, index=X.index)
6566

6667
return X_transformed
@@ -79,11 +80,6 @@ def preprocess_script():
7980

8081
X_train, X_valid, y_train, y_valid = prepreprocess()
8182

82-
preprocessor, numerical_cols, categorical_cols = preprocess_fit(X_train)
83-
84-
X_train = preprocess_transform(X_train, preprocessor, numerical_cols, categorical_cols)
85-
X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols, categorical_cols)
86-
8783
submission_df = pd.read_csv("/kaggle/input/test.csv")
8884

8985
ids = submission_df["row_id"]
@@ -94,10 +90,8 @@ def preprocess_script():
9490
if col not in submission_df.columns:
9591
submission_df[col] = 0 # Fill with 0 or another appropriate value
9692

97-
X_test = preprocess_transform(submission_df, preprocessor, numerical_cols, categorical_cols)
98-
9993
# Handle missing values
100-
for df in [X_train, X_valid, X_test]:
94+
for df in [X_train, X_valid, submission_df]:
10195
df.fillna(df.mean(), inplace=True)
10296

103-
return X_train, X_valid, y_train, y_valid, X_test, ids
97+
return X_train, X_valid, y_train, y_valid, submission_df, ids

rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/model_xgboost.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
1818
params = {
1919
"objective": "reg:squarederror", # Use squared error for regression
2020
"nthread": -1,
21-
"tree_method": "gpu_hist",
21+
"tree_method": "hist",
2222
"device": "cuda",
2323
}
2424
num_round = 200

0 commit comments

Comments (0)