Skip to content

Commit 3c6bcab

Browse files
authored
fix: fix some errors in scenario.py, proposal.py, and runner.py, and several complex competition scenarios (microsoft#365)
* fix several bugs in proposal and runner
* fix a bug in feedback-prize-english-language-learning
* fix some bugs and templates
* fix the bug in optiver and the NLP problem
1 parent 312029d commit 3c6bcab

File tree

14 files changed

+155
-171
lines changed

14 files changed

+155
-171
lines changed

rdagent/log/ui/app.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -443,9 +443,10 @@ def tasks_window(tasks: list[FactorTask | ModelTask]):
443443
st.latex(ft.factor_formulation)
444444

445445
mks = "| Variable | Description |\n| --- | --- |\n"
446-
for v, d in ft.variables.items():
447-
mks += f"| ${v}$ | {d} |\n"
448-
st.markdown(mks)
446+
if isinstance(ft.variables, dict):
447+
for v, d in ft.variables.items():
448+
mks += f"| ${v}$ | {d} |\n"
449+
st.markdown(mks)
449450

450451
elif isinstance(tasks[0], ModelTask):
451452
st.markdown("**Model Tasks🚩**")

rdagent/scenarios/kaggle/developer/feedback.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
163163
self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
164164
self.scen.vector_base.save()
165165
elif self.scen.if_using_graph_rag:
166-
self.scen.trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)
166+
trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)
167167

168168
return HypothesisFeedback(
169169
observations=observations,

rdagent/scenarios/kaggle/developer/runner.py

Lines changed: 66 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,48 @@ def get_cache_key(self, exp: ASpecificExp) -> str:
3232
codes = "\n".join(codes)
3333
return md5_hash(codes)
3434

35+
def extract_model_task_from_code(self, code: str) -> str:
36+
sys_prompt = (
37+
Environment(undefined=StrictUndefined)
38+
.from_string(prompt_dict["extract_model_task_from_code"]["system"])
39+
.render()
40+
)
41+
42+
user_prompt = (
43+
Environment(undefined=StrictUndefined)
44+
.from_string(prompt_dict["extract_model_task_from_code"]["user"])
45+
.render(file_content=code)
46+
)
47+
48+
model_task_description = APIBackend().build_messages_and_create_chat_completion(
49+
user_prompt=user_prompt,
50+
system_prompt=sys_prompt,
51+
json_mode=True,
52+
)
53+
54+
try:
55+
response_json_analysis = json.loads(model_task_description)
56+
task_desc = f"""name: {response_json_analysis['name']}
57+
description: {response_json_analysis['description']}
58+
"""
59+
task_desc += (
60+
f"formulation: {response_json_analysis['formulation']}\n"
61+
if response_json_analysis.get("formulation")
62+
else ""
63+
)
64+
task_desc += f"architecture: {response_json_analysis['architecture']}\n"
65+
task_desc += (
66+
f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
67+
if response_json_analysis.get("variables")
68+
else ""
69+
)
70+
task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
71+
task_desc += f"model_type: {response_json_analysis['model_type']}\n"
72+
except json.JSONDecodeError:
73+
task_desc = "Failed to parse LLM's response as JSON"
74+
75+
return task_desc
76+
3577
def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorExperiment | KGModelExperiment:
3678
"""
3779
For the initial development, the experiment serves as a benchmark for feature engineering.
@@ -59,21 +101,27 @@ def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorE
59101
feature_shape = org_data.shape[-1]
60102
exp.experiment_workspace.data_description.append((sub_task.get_task_information(), feature_shape))
61103

62-
sub_model_1_description = (
63-
self.extract_model_task_from_code(
64-
(exp.experiment_workspace.workspace_path / "model" / "model_randomforest.py").read_text()
65-
)
66-
+ f"""code: { (exp.experiment_workspace.workspace_path / "model" / "model_randomforest.py").read_text()}"""
67-
)
68-
sub_model_2_description = (
69-
self.extract_model_task_from_code(
70-
(exp.experiment_workspace.workspace_path / "model" / "model_xgboost.py").read_text()
71-
)
72-
+ f"""code: { (exp.experiment_workspace.workspace_path / "model" / "model_xgboost.py").read_text()}"""
73-
)
104+
model_map = {
105+
"XGBoost": "model_xgboost.py",
106+
"RandomForest": "model_randomforest.py",
107+
"LightGBM": "model_lightgbm.py",
108+
"NN": "model_nn.py",
109+
}
110+
111+
workspace_path = exp.experiment_workspace.workspace_path / "model"
112+
113+
for model_name, model_file in model_map.items():
114+
model_file_path = workspace_path / model_file
74115

75-
exp.experiment_workspace.model_description["XGBoost"] = sub_model_1_description
76-
exp.experiment_workspace.model_description["RandomForest"] = sub_model_2_description
116+
if model_file_path.exists():
117+
model_description = (
118+
self.extract_model_task_from_code(model_file_path.read_text())
119+
+ f"""code: {model_file_path.read_text()}"""
120+
)
121+
else:
122+
model_description = ""
123+
124+
exp.experiment_workspace.model_description[model_name] = model_description
77125

78126
if RUNNER_SETTINGS.cache_result:
79127
self.dump_cache_result(exp, result)
@@ -120,51 +168,7 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
120168

121169

122170
class KGFactorRunner(KGCachedRunner[KGFactorExperiment]):
123-
def extract_model_task_from_code(self, code: str) -> str:
124-
sys_prompt = (
125-
Environment(undefined=StrictUndefined)
126-
.from_string(prompt_dict["extract_model_task_from_code"]["system"])
127-
.render()
128-
)
129-
130-
user_prompt = (
131-
Environment(undefined=StrictUndefined)
132-
.from_string(prompt_dict["extract_model_task_from_code"]["user"])
133-
.render(file_content=code)
134-
)
135-
136-
model_task_description = APIBackend().build_messages_and_create_chat_completion(
137-
user_prompt=user_prompt,
138-
system_prompt=sys_prompt,
139-
json_mode=True,
140-
)
141-
142-
try:
143-
response_json_analysis = json.loads(model_task_description)
144-
task_desc = f"""name: {response_json_analysis['name']}
145-
description: {response_json_analysis['description']}
146-
"""
147-
task_desc += (
148-
f"formulation: {response_json_analysis['formulation']}\n"
149-
if response_json_analysis.get("formulation")
150-
else ""
151-
)
152-
task_desc += f"architecture: {response_json_analysis['architecture']}\n"
153-
task_desc += (
154-
f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
155-
if response_json_analysis.get("variables")
156-
else ""
157-
)
158-
task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
159-
task_desc += f"model_type: {response_json_analysis['model_type']}\n"
160-
except json.JSONDecodeError:
161-
task_desc = "Failed to parse LLM's response as JSON"
162-
163-
return task_desc
164-
165171
def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
166-
if exp.based_experiments and exp.based_experiments[-1].result is None:
167-
exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])
168172
current_feature_file_count = len(list(exp.experiment_workspace.workspace_path.glob("feature/feature*.py")))
169173
implemented_factor_count = 0
170174
for sub_ws in exp.sub_workspace_list:
@@ -179,6 +183,10 @@ def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
179183
if implemented_factor_count == 0:
180184
raise FactorEmptyError("No factor is implemented")
181185

186+
# initial template result
187+
if exp.based_experiments and exp.based_experiments[-1].result is None:
188+
exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])
189+
182190
if RUNNER_SETTINGS.cache_result:
183191
cache_hit, result = self.get_cache_result(exp)
184192
if cache_hit:

rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,8 @@ def data_cleaner(text):
3636

3737
y_train = train[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]]
3838

39-
vectorizer = TfidfVectorizer()
40-
X_train = vectorizer.fit_transform(train["full_text"])
41-
X_test = vectorizer.transform(test["full_text"])
42-
43-
X_train = pd.DataFrame.sparse.from_spmatrix(X_train)
44-
X_test = pd.DataFrame.sparse.from_spmatrix(X_test)
39+
X_train = train[["full_text"]]
40+
X_test = test[["full_text"]]
4541

4642
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
4743

rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import pandas as pd
2+
from sklearn.feature_extraction.text import TfidfVectorizer
23

34
"""
45
Here is the feature engineering code for each task, with a class that has a fit and transform method.
@@ -11,12 +12,15 @@ def fit(self, train_df: pd.DataFrame):
1112
"""
1213
Fit the feature engineering model to the training data.
1314
"""
14-
pass
15+
self.vectorizer = TfidfVectorizer()
16+
self.vectorizer.fit(train_df["full_text"])
1517

1618
def transform(self, X: pd.DataFrame):
1719
"""
1820
Transform the input data.
1921
"""
22+
X = self.vectorizer.transform(X["full_text"])
23+
X = pd.DataFrame.sparse.from_spmatrix(X)
2024
return X
2125

2226

rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_xgboost.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
2222
X_train = select(X_train)
2323

2424
xgb_estimator = xgb.XGBRegressor(
25-
n_estimators=500, random_state=0, objective="reg:squarederror", tree_method="gpu_hist", device="cuda"
25+
n_estimators=500, random_state=0, objective="reg:squarederror", tree_method="hist", device="cuda"
2626
)
2727

2828
model = MultiOutputRegressor(xgb_estimator, n_jobs=-1)

rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py

Lines changed: 12 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ def import_module_from_path(module_name, module_path):
1515
return module
1616

1717

18+
def MCRMSE(y_true, y_pred):
19+
return np.mean(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0)))
20+
21+
1822
# 1) Preprocess the data
1923
X_train, X_valid, y_train, y_valid, X_test = preprocess_script()
2024

@@ -24,6 +28,7 @@ def import_module_from_path(module_name, module_path):
2428

2529
for f in DIRNAME.glob("feature/feat*.py"):
2630
cls = import_module_from_path(f.stem, f).feature_engineering_cls()
31+
print(X_train.head())
2732
cls.fit(X_train)
2833
X_train_f = cls.transform(X_train)
2934
X_valid_f = cls.transform(X_valid)
@@ -62,33 +67,18 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
6267

6368
# 4) Evaluate the model on the validation set
6469
y_valid_pred_l = []
70+
metrics_all = []
6571
for model, predict_func in model_l:
6672
y_valid_pred = predict_func(model, X_valid)
6773
y_valid_pred_l.append(y_valid_pred)
68-
# print(y_valid_pred)
69-
# print(y_valid_pred.shape)
70-
71-
# 5) Ensemble
72-
# Majority vote ensemble
73-
y_valid_pred_ensemble = np.mean(y_valid_pred_l, axis=0)
74-
74+
metrics = MCRMSE(y_valid, y_valid_pred)
75+
print(f"MCRMSE on valid set: {metrics}")
76+
metrics_all.append(metrics)
7577

76-
# 6) Save the validation metrics
77-
def MCRMSE(y_true, y_pred):
78-
return np.mean(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0)))
79-
80-
81-
metrics = MCRMSE(y_valid, y_valid_pred_ensemble)
82-
print(f"MCRMSE on valid set: {metrics}")
83-
pd.Series(data=[metrics], index=["MCRMSE"]).to_csv("submission_score.csv")
84-
85-
# 7) Make predictions on the test set and save them
86-
y_test_pred_l = []
87-
for model, predict_func in model_l:
88-
y_test_pred_l.append(predict_func(model, X_test))
78+
min_index = np.argmin(metrics_all)
79+
pd.Series(data=[metrics_all[min_index]], index=["MCRMSE"]).to_csv("submission_score.csv")
8980

90-
# For multiclass classification, use the mode of the predictions
91-
y_test_pred = np.mean(y_test_pred_l, axis=0)
81+
y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test)
9282

9383

9484
submission_result = pd.read_csv("/kaggle/input/sample_submission.csv")

rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,20 @@ def import_module_from_path(module_name, module_path):
9393
X_te = X_te.loc[:, ~X_te.columns.duplicated()]
9494

9595
# Train the model
96+
def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
97+
"""
98+
Flatten the columns of a DataFrame with MultiIndex columns,
99+
for (feature_0, a), (feature_0, b) -> feature_0_a, feature_0_b
100+
"""
101+
if df.columns.nlevels == 1:
102+
return df
103+
df.columns = ["_".join(col).strip() for col in df.columns.values]
104+
return df
105+
106+
X_tr = flatten_columns(X_tr)
107+
X_val = flatten_columns(X_val)
108+
X_te = flatten_columns(X_te)
109+
96110
model_l = [] # list[tuple[model, predict_func]]
97111
for f in DIRNAME.glob("model/model*.py"):
98112
m = import_module_from_path(f.stem, f)

rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/fea_share_preprocess.py

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,24 +11,26 @@
1111

1212
def prepreprocess():
1313
# Load the training data
14-
train_df = pd.read_csv("/kaggle/input/train.csv").head(1000)
14+
train_df = pd.read_csv("/kaggle/input/train.csv")
1515

1616
# Load book and trade data
17-
book_train = pd.read_parquet("/kaggle/input/book_train.parquet").head(1000)
18-
trade_train = pd.read_parquet("/kaggle/input/trade_train.parquet").head(1000)
17+
book_train = pd.read_parquet("/kaggle/input/book_train.parquet")
18+
trade_train = pd.read_parquet("/kaggle/input/trade_train.parquet")
1919

2020
# Merge book and trade data with train_df
2121
merged_df = pd.merge(train_df, book_train, on=["stock_id", "time_id"], how="left")
2222
merged_df = pd.merge(merged_df, trade_train, on=["stock_id", "time_id"], how="left")
2323

24-
print(merged_df.head())
25-
2624
# Split the data
2725
X = merged_df.drop(["target"], axis=1)
2826
y = merged_df["target"]
2927

28+
print(X.columns.to_list())
29+
3030
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
3131

32+
print(X_train.columns.to_list())
33+
3234
return X_train, X_valid, y_train, y_valid
3335

3436

@@ -60,7 +62,6 @@ def preprocess_fit(X_train: pd.DataFrame):
6062
def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols, categorical_cols):
6163
X_transformed = preprocessor.transform(X)
6264

63-
# Convert arrays back to DataFrames
6465
X_transformed = pd.DataFrame(X_transformed, columns=numerical_cols + categorical_cols, index=X.index)
6566

6667
return X_transformed
@@ -79,11 +80,6 @@ def preprocess_script():
7980

8081
X_train, X_valid, y_train, y_valid = prepreprocess()
8182

82-
preprocessor, numerical_cols, categorical_cols = preprocess_fit(X_train)
83-
84-
X_train = preprocess_transform(X_train, preprocessor, numerical_cols, categorical_cols)
85-
X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols, categorical_cols)
86-
8783
submission_df = pd.read_csv("/kaggle/input/test.csv")
8884

8985
ids = submission_df["row_id"]
@@ -94,10 +90,8 @@ def preprocess_script():
9490
if col not in submission_df.columns:
9591
submission_df[col] = 0 # Fill with 0 or another appropriate value
9692

97-
X_test = preprocess_transform(submission_df, preprocessor, numerical_cols, categorical_cols)
98-
9993
# Handle missing values
100-
for df in [X_train, X_valid, X_test]:
94+
for df in [X_train, X_valid, submission_df]:
10195
df.fillna(df.mean(), inplace=True)
10296

103-
return X_train, X_valid, y_train, y_valid, X_test, ids
97+
return X_train, X_valid, y_train, y_valid, submission_df, ids

rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/model_xgboost.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
1818
params = {
1919
"objective": "reg:squarederror", # Use squared error for regression
2020
"nthread": -1,
21-
"tree_method": "gpu_hist",
21+
"tree_method": "hist",
2222
"device": "cuda",
2323
}
2424
num_round = 200

0 commit comments

Comments (0)