Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions rdagent/log/ui/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,9 +443,10 @@ def tasks_window(tasks: list[FactorTask | ModelTask]):
st.latex(ft.factor_formulation)

mks = "| Variable | Description |\n| --- | --- |\n"
for v, d in ft.variables.items():
mks += f"| ${v}$ | {d} |\n"
st.markdown(mks)
if isinstance(ft.variables, dict):
for v, d in ft.variables.items():
mks += f"| ${v}$ | {d} |\n"
st.markdown(mks)

elif isinstance(tasks[0], ModelTask):
st.markdown("**Model Tasks🚩**")
Expand Down
2 changes: 1 addition & 1 deletion rdagent/scenarios/kaggle/developer/feedback.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
self.scen.vector_base.save()
elif self.scen.if_using_graph_rag:
self.scen.trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)
trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)

return HypothesisFeedback(
observations=observations,
Expand Down
124 changes: 66 additions & 58 deletions rdagent/scenarios/kaggle/developer/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,48 @@ def get_cache_key(self, exp: ASpecificExp) -> str:
codes = "\n".join(codes)
return md5_hash(codes)

def extract_model_task_from_code(self, code: str) -> str:
    """
    Ask the LLM to describe the model implemented in ``code`` and render the
    answer as a plain-text task description.

    The system and user prompts are taken from the
    ``extract_model_task_from_code`` entry of ``prompt_dict``; the user prompt
    is rendered with ``code`` as ``file_content``. The completion is requested
    in JSON mode.

    :param code: source text of the model file to describe.
    :return: "key: value" lines for name, description, formulation (optional),
        architecture, variables (optional), hyperparameters and model_type,
        or a fixed failure message when the response cannot be used.
    """
    prompts = prompt_dict["extract_model_task_from_code"]
    env = Environment(undefined=StrictUndefined)
    sys_prompt = env.from_string(prompts["system"]).render()
    user_prompt = env.from_string(prompts["user"]).render(file_content=code)

    model_task_description = APIBackend().build_messages_and_create_chat_completion(
        user_prompt=user_prompt,
        system_prompt=sys_prompt,
        json_mode=True,
    )

    try:
        response_json_analysis = json.loads(model_task_description)
        task_desc = (
            f"name: {response_json_analysis['name']}\n"
            f"description: {response_json_analysis['description']}\n"
        )
        # "formulation" and "variables" are optional: emitted only when present and non-empty.
        if response_json_analysis.get("formulation"):
            task_desc += f"formulation: {response_json_analysis['formulation']}\n"
        task_desc += f"architecture: {response_json_analysis['architecture']}\n"
        if response_json_analysis.get("variables"):
            task_desc += f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
        task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
        task_desc += f"model_type: {response_json_analysis['model_type']}\n"
    except (json.JSONDecodeError, KeyError, TypeError):
        # JSONDecodeError: the response is not JSON at all.
        # KeyError: valid JSON that lacks one of the required fields.
        # TypeError: valid JSON that is not an object (e.g. a list or string).
        # All three mean the response is unusable, so fall back to the same
        # message instead of letting the runner crash on a malformed answer.
        task_desc = "Failed to parse LLM's response as JSON"

    return task_desc

def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorExperiment | KGModelExperiment:
"""
For the initial development, the experiment serves as a benchmark for feature engineering.
Expand Down Expand Up @@ -59,21 +101,27 @@ def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorE
feature_shape = org_data.shape[-1]
exp.experiment_workspace.data_description.append((sub_task.get_task_information(), feature_shape))

sub_model_1_description = (
self.extract_model_task_from_code(
(exp.experiment_workspace.workspace_path / "model" / "model_randomforest.py").read_text()
)
+ f"""code: { (exp.experiment_workspace.workspace_path / "model" / "model_randomforest.py").read_text()}"""
)
sub_model_2_description = (
self.extract_model_task_from_code(
(exp.experiment_workspace.workspace_path / "model" / "model_xgboost.py").read_text()
)
+ f"""code: { (exp.experiment_workspace.workspace_path / "model" / "model_xgboost.py").read_text()}"""
)
model_map = {
"XGBoost": "model_xgboost.py",
"RandomForest": "model_randomforest.py",
"LightGBM": "model_lightgbm.py",
"NN": "model_nn.py",
}

workspace_path = exp.experiment_workspace.workspace_path / "model"

for model_name, model_file in model_map.items():
model_file_path = workspace_path / model_file

exp.experiment_workspace.model_description["XGBoost"] = sub_model_1_description
exp.experiment_workspace.model_description["RandomForest"] = sub_model_2_description
if model_file_path.exists():
model_description = (
self.extract_model_task_from_code(model_file_path.read_text())
+ f"""code: {model_file_path.read_text()}"""
)
else:
model_description = ""

exp.experiment_workspace.model_description[model_name] = model_description

if RUNNER_SETTINGS.cache_result:
self.dump_cache_result(exp, result)
Expand Down Expand Up @@ -120,51 +168,7 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment:


class KGFactorRunner(KGCachedRunner[KGFactorExperiment]):
def extract_model_task_from_code(self, code: str) -> str:
    """
    Ask the LLM to describe the model implemented in ``code`` and render the
    answer as a plain-text task description.

    The system and user prompts are taken from the
    ``extract_model_task_from_code`` entry of ``prompt_dict``; the user prompt
    is rendered with ``code`` as ``file_content``. The completion is requested
    in JSON mode.

    :param code: source text of the model file to describe.
    :return: "key: value" lines for name, description, formulation (optional),
        architecture, variables (optional), hyperparameters and model_type,
        or a fixed failure message when the response cannot be used.
    """
    prompts = prompt_dict["extract_model_task_from_code"]
    env = Environment(undefined=StrictUndefined)
    sys_prompt = env.from_string(prompts["system"]).render()
    user_prompt = env.from_string(prompts["user"]).render(file_content=code)

    model_task_description = APIBackend().build_messages_and_create_chat_completion(
        user_prompt=user_prompt,
        system_prompt=sys_prompt,
        json_mode=True,
    )

    try:
        response_json_analysis = json.loads(model_task_description)
        task_desc = (
            f"name: {response_json_analysis['name']}\n"
            f"description: {response_json_analysis['description']}\n"
        )
        # "formulation" and "variables" are optional: emitted only when present and non-empty.
        if response_json_analysis.get("formulation"):
            task_desc += f"formulation: {response_json_analysis['formulation']}\n"
        task_desc += f"architecture: {response_json_analysis['architecture']}\n"
        if response_json_analysis.get("variables"):
            task_desc += f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
        task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
        task_desc += f"model_type: {response_json_analysis['model_type']}\n"
    except (json.JSONDecodeError, KeyError, TypeError):
        # JSONDecodeError: the response is not JSON at all.
        # KeyError: valid JSON that lacks one of the required fields.
        # TypeError: valid JSON that is not an object (e.g. a list or string).
        # All three mean the response is unusable, so fall back to the same
        # message instead of letting the runner crash on a malformed answer.
        task_desc = "Failed to parse LLM's response as JSON"

    return task_desc

def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
if exp.based_experiments and exp.based_experiments[-1].result is None:
exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])
current_feature_file_count = len(list(exp.experiment_workspace.workspace_path.glob("feature/feature*.py")))
implemented_factor_count = 0
for sub_ws in exp.sub_workspace_list:
Expand All @@ -179,6 +183,10 @@ def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
if implemented_factor_count == 0:
raise FactorEmptyError("No factor is implemented")

# initial template result
if exp.based_experiments and exp.based_experiments[-1].result is None:
exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])

if RUNNER_SETTINGS.cache_result:
cache_hit, result = self.get_cache_result(exp)
if cache_hit:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,8 @@ def data_cleaner(text):

y_train = train[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]]

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train["full_text"])
X_test = vectorizer.transform(test["full_text"])

X_train = pd.DataFrame.sparse.from_spmatrix(X_train)
X_test = pd.DataFrame.sparse.from_spmatrix(X_test)
X_train = train[["full_text"]]
X_test = test[["full_text"]]

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
Expand All @@ -11,12 +12,15 @@ def fit(self, train_df: pd.DataFrame):
"""
Fit the feature engineering model to the training data.
"""
pass
self.vectorizer = TfidfVectorizer()
self.vectorizer.fit(train_df["full_text"])

def transform(self, X: pd.DataFrame):
    """
    Vectorize the "full_text" column of ``X`` with the vectorizer stored on
    this instance and return the result as a sparse-backed DataFrame.
    """
    text_matrix = self.vectorizer.transform(X["full_text"])
    return pd.DataFrame.sparse.from_spmatrix(text_matrix)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
X_train = select(X_train)

xgb_estimator = xgb.XGBRegressor(
n_estimators=500, random_state=0, objective="reg:squarederror", tree_method="gpu_hist", device="cuda"
n_estimators=500, random_state=0, objective="reg:squarederror", tree_method="hist", device="cuda"
)

model = MultiOutputRegressor(xgb_estimator, n_jobs=-1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ def import_module_from_path(module_name, module_path):
return module


def MCRMSE(y_true, y_pred):
    """
    Mean column-wise root mean squared error: the RMSE is computed per column
    (averaging over axis 0) and those per-column values are then averaged.
    """
    squared_error = (y_true - y_pred) ** 2
    per_column_rmse = np.sqrt(np.mean(squared_error, axis=0))
    return np.mean(per_column_rmse)


# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test = preprocess_script()

Expand All @@ -24,6 +28,7 @@ def import_module_from_path(module_name, module_path):

for f in DIRNAME.glob("feature/feat*.py"):
cls = import_module_from_path(f.stem, f).feature_engineering_cls()
print(X_train.head())
cls.fit(X_train)
X_train_f = cls.transform(X_train)
X_valid_f = cls.transform(X_valid)
Expand Down Expand Up @@ -62,33 +67,18 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:

# 4) Evaluate the model on the validation set
y_valid_pred_l = []
metrics_all = []
for model, predict_func in model_l:
y_valid_pred = predict_func(model, X_valid)
y_valid_pred_l.append(y_valid_pred)
# print(y_valid_pred)
# print(y_valid_pred.shape)

# 5) Ensemble
# Mean ensemble: average the predictions of all models
y_valid_pred_ensemble = np.mean(y_valid_pred_l, axis=0)

metrics = MCRMSE(y_valid, y_valid_pred)
print(f"MCRMSE on valid set: {metrics}")
metrics_all.append(metrics)

# 6) Save the validation metrics
def MCRMSE(y_true, y_pred):
    """
    Mean column-wise root mean squared error: the RMSE is computed per column
    (averaging over axis 0) and those per-column values are then averaged.
    """
    squared_error = (y_true - y_pred) ** 2
    per_column_rmse = np.sqrt(np.mean(squared_error, axis=0))
    return np.mean(per_column_rmse)


metrics = MCRMSE(y_valid, y_valid_pred_ensemble)
print(f"MCRMSE on valid set: {metrics}")
pd.Series(data=[metrics], index=["MCRMSE"]).to_csv("submission_score.csv")

# 7) Make predictions on the test set and save them
y_test_pred_l = []
for model, predict_func in model_l:
y_test_pred_l.append(predict_func(model, X_test))
min_index = np.argmin(metrics_all)
pd.Series(data=[metrics_all[min_index]], index=["MCRMSE"]).to_csv("submission_score.csv")

# For multiclass classification, use the mode of the predictions
y_test_pred = np.mean(y_test_pred_l, axis=0)
y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test)


submission_result = pd.read_csv("/kaggle/input/sample_submission.csv")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,20 @@ def import_module_from_path(module_name, module_path):
X_te = X_te.loc[:, ~X_te.columns.duplicated()]

# Train the model
def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Flatten the columns of a DataFrame with MultiIndex columns,
    e.g. (feature_0, a), (feature_0, b) -> feature_0_a, feature_0_b.

    A DataFrame whose columns have a single level is returned untouched.
    Otherwise the columns are renamed in place and the same DataFrame object
    is returned.
    """
    if df.columns.nlevels == 1:
        return df
    # Level values are passed through str() so that non-string labels
    # (e.g. integer column positions) do not make the join raise TypeError.
    df.columns = ["_".join(map(str, col)).strip() for col in df.columns.values]
    return df

X_tr = flatten_columns(X_tr)
X_val = flatten_columns(X_val)
X_te = flatten_columns(X_te)

model_l = [] # list[tuple[model, predict_func]]
for f in DIRNAME.glob("model/model*.py"):
m = import_module_from_path(f.stem, f)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,26 @@

def prepreprocess():
    """
    Load the training table together with the book and trade data, merge
    them on ("stock_id", "time_id") and split the result into train and
    validation parts.

    Returns X_train, X_valid, y_train, y_valid, where the y values are the
    "target" column and the X values are every other merged column.
    """
    # Load the training data (each file is read once, in full)
    train_df = pd.read_csv("/kaggle/input/train.csv")

    # Load book and trade data
    book_train = pd.read_parquet("/kaggle/input/book_train.parquet")
    trade_train = pd.read_parquet("/kaggle/input/trade_train.parquet")

    # Merge book and trade data with train_df; left joins so no train_df row is dropped
    merged_df = pd.merge(train_df, book_train, on=["stock_id", "time_id"], how="left")
    merged_df = pd.merge(merged_df, trade_train, on=["stock_id", "time_id"], how="left")

    print(merged_df.head())

    # Split the data
    X = merged_df.drop(["target"], axis=1)
    y = merged_df["target"]

    print(X.columns.to_list())

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    print(X_train.columns.to_list())

    return X_train, X_valid, y_train, y_valid


Expand Down Expand Up @@ -60,7 +62,6 @@ def preprocess_fit(X_train: pd.DataFrame):
def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols, categorical_cols):
    """
    Apply ``preprocessor.transform`` to ``X`` and wrap the result in a
    DataFrame that keeps the index of ``X`` and is labelled with the
    numerical column names followed by the categorical ones.
    """
    transformed_values = preprocessor.transform(X)
    column_names = numerical_cols + categorical_cols
    return pd.DataFrame(transformed_values, columns=column_names, index=X.index)
Expand All @@ -79,11 +80,6 @@ def preprocess_script():

X_train, X_valid, y_train, y_valid = prepreprocess()

preprocessor, numerical_cols, categorical_cols = preprocess_fit(X_train)

X_train = preprocess_transform(X_train, preprocessor, numerical_cols, categorical_cols)
X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols, categorical_cols)

submission_df = pd.read_csv("/kaggle/input/test.csv")

ids = submission_df["row_id"]
Expand All @@ -94,10 +90,8 @@ def preprocess_script():
if col not in submission_df.columns:
submission_df[col] = 0 # Fill with 0 or another appropriate value

X_test = preprocess_transform(submission_df, preprocessor, numerical_cols, categorical_cols)

# Handle missing values
for df in [X_train, X_valid, X_test]:
for df in [X_train, X_valid, submission_df]:
df.fillna(df.mean(), inplace=True)

return X_train, X_valid, y_train, y_valid, X_test, ids
return X_train, X_valid, y_train, y_valid, submission_df, ids
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
params = {
"objective": "reg:squarederror", # Use squared error for regression
"nthread": -1,
"tree_method": "gpu_hist",
"tree_method": "hist",
"device": "cuda",
}
num_round = 200
Expand Down
Loading