Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Fix the bugs in the Optiver and NLP problems
  • Loading branch information
WinstonLiyt committed Sep 27, 2024
commit ccb95c4f052df07ca0b547158e74dbe0befcc3de
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,6 @@ def data_cleaner(text):

X_train = train[["full_text"]]
X_test = test[["full_text"]]
# vectorizer = TfidfVectorizer()
# X_train = vectorizer.fit_transform(train["full_text"])
# X_test = vectorizer.transform(test["full_text"])

# X_train = pd.DataFrame.sparse.from_spmatrix(X_train)
# X_test = pd.DataFrame.sparse.from_spmatrix(X_test)

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ def import_module_from_path(module_name, module_path):
return module


def MCRMSE(y_true, y_pred):
    """Mean Columnwise Root Mean Squared Error.

    Computes the RMSE of each target column independently (mean over
    rows, axis=0), then averages those per-column RMSEs.
    """
    column_mse = np.mean((y_true - y_pred) ** 2, axis=0)
    return np.sqrt(column_mse).mean()


# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test = preprocess_script()

Expand Down Expand Up @@ -63,33 +67,18 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:

# 4) Evaluate the model on the validation set
y_valid_pred_l = []
metrics_all = []
for model, predict_func in model_l:
y_valid_pred = predict_func(model, X_valid)
y_valid_pred_l.append(y_valid_pred)
# print(y_valid_pred)
# print(y_valid_pred.shape)

# 5) Ensemble
# Majority vote ensemble
y_valid_pred_ensemble = np.mean(y_valid_pred_l, axis=0)

metrics = MCRMSE(y_valid, y_valid_pred)
print(f"MCRMSE on valid set: {metrics}")
metrics_all.append(metrics)

# 6) Save the validation metrics
def MCRMSE(y_true, y_pred):
return np.mean(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0)))


metrics = MCRMSE(y_valid, y_valid_pred_ensemble)
print(f"MCRMSE on valid set: {metrics}")
pd.Series(data=[metrics], index=["MCRMSE"]).to_csv("submission_score.csv")

# 7) Make predictions on the test set and save them
y_test_pred_l = []
for model, predict_func in model_l:
y_test_pred_l.append(predict_func(model, X_test))
min_index = np.argmin(metrics_all)
pd.Series(data=[metrics_all[min_index]], index=["MCRMSE"]).to_csv("submission_score.csv")

# For multiclass classification, use the mode of the predictions
y_test_pred = np.mean(y_test_pred_l, axis=0)
y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test)


submission_result = pd.read_csv("/kaggle/input/sample_submission.csv")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,16 @@

def prepreprocess():
# Load the training data
train_df = pd.read_csv("/kaggle/input/train.csv").head(1000)
train_df = pd.read_csv("/kaggle/input/train.csv")

# Load book and trade data
book_train = pd.read_parquet("/kaggle/input/book_train.parquet").head(1000)
trade_train = pd.read_parquet("/kaggle/input/trade_train.parquet").head(1000)
book_train = pd.read_parquet("/kaggle/input/book_train.parquet")
trade_train = pd.read_parquet("/kaggle/input/trade_train.parquet")

# Merge book and trade data with train_df
# print(book_train.columns.to_list())
# print(trade_train.columns.to_list())
# print(train_df.columns.to_list())
merged_df = pd.merge(train_df, book_train, on=["stock_id", "time_id"], how="left")
merged_df = pd.merge(merged_df, trade_train, on=["stock_id", "time_id"], how="left")

# print(merged_df.columns.to_list())

# Split the data
X = merged_df.drop(["target"], axis=1)
y = merged_df["target"]
Expand Down Expand Up @@ -67,7 +62,6 @@ def preprocess_fit(X_train: pd.DataFrame):
def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols, categorical_cols):
X_transformed = preprocessor.transform(X)

# Convert arrays back to DataFrames
X_transformed = pd.DataFrame(X_transformed, columns=numerical_cols + categorical_cols, index=X.index)

return X_transformed
Expand All @@ -85,12 +79,6 @@ def preprocess_script():
return X_train, X_valid, y_train, y_valid, X_test, *others

X_train, X_valid, y_train, y_valid = prepreprocess()
print(X_train.columns.to_list())

# preprocessor, numerical_cols, categorical_cols = preprocess_fit(X_train)

# X_train = preprocess_transform(X_train, preprocessor, numerical_cols, categorical_cols)
# X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols, categorical_cols)

submission_df = pd.read_csv("/kaggle/input/test.csv")

Expand All @@ -102,8 +90,6 @@ def preprocess_script():
if col not in submission_df.columns:
submission_df[col] = 0 # Fill with 0 or another appropriate value

# X_test = preprocess_transform(submission_df, preprocessor, numerical_cols, categorical_cols)

# Handle missing values
for df in [X_train, X_valid, submission_df]:
df.fillna(df.mean(), inplace=True)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
import pandas as pd

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
Remember
"""

class MidPriceFeature:

class IdentityFeature:
def fit(self, train_df: pd.DataFrame):
return self
"""
Fit the feature engineering model to the training data.
"""
pass

def transform(self, X: pd.DataFrame):
# Check if the required columns exist in the DataFrame
if "bid_price1" not in X.columns or "ask_price1" not in X.columns:
print("Warning: Required columns bid_price1 and ask_price1 are missing from the DataFrame")
return pd.DataFrame(index=X.index)
X["mid_price"] = (X["bid_price1"] + X["ask_price1"]) / 2
return X[["mid_price"]]
"""
Transform the input data.
"""
return X


feature_engineering_cls = MidPriceFeature
feature_engineering_cls = IdentityFeature
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@
import numpy as np
import pandas as pd
from fea_share_preprocess import preprocess_script
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Set random seed for reproducibility
SEED = 42
Expand All @@ -17,7 +15,7 @@


def compute_rmspe(y_true, y_pred):
    """Compute Root Mean Squared Percentage Error (RMSPE) for regression.

    NOTE(review): the error is divided elementwise by y_true, so any zero
    in y_true yields inf/nan — callers must ensure targets are nonzero.
    """
    # Fix: the block carried two stacked docstrings (stale diff artifact);
    # keep a single authoritative one. Computation is unchanged.
    rmspe = np.sqrt(np.mean(((y_true - y_pred) / y_true) ** 2))
    return rmspe

Expand All @@ -29,17 +27,14 @@ def import_module_from_path(module_name, module_path):
return module


print("begin preprocess")
# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()
print("preprocess done")


# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []

print(X_train.columns.tolist())

for f in DIRNAME.glob("feature/feat*.py"):
cls = import_module_from_path(f.stem, f).feature_engineering_cls()
cls.fit(X_train)
Expand All @@ -62,8 +57,6 @@ def import_module_from_path(module_name, module_path):
X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")

X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
Expand Down Expand Up @@ -100,17 +93,19 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
# 4) Evaluate the model on the validation set
y_valid_pred_l = []
metrics_all = []

for model, predict_func in model_l:
y_valid_pred_l.append(predict_func(model, X_valid))
metrics = compute_rmspe(y_valid, y_valid_pred_l[-1].ravel())
print(f"RMSPE on valid set: {metrics}")
metrics_all.append(metrics)

min_index = np.argmin(metrics_all)

pd.Series(data=[metrics_all[min_index]], index=["RMSPE"]).to_csv("submission_score.csv")

y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test)
print(y_test_pred)
y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test).ravel()

# 5) Submit predictions for the test set
submission_result = pd.DataFrame({"row_id": ids, "target": y_test_pred})
submission_result.to_csv("submission.csv", index=False)