Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Fix the bugs in the Optiver and NLP problems
  • Loading branch information
WinstonLiyt committed Sep 27, 2024
commit ccb95c4f052df07ca0b547158e74dbe0befcc3de
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,6 @@ def data_cleaner(text):

X_train = train[["full_text"]]
X_test = test[["full_text"]]
# vectorizer = TfidfVectorizer()
# X_train = vectorizer.fit_transform(train["full_text"])
# X_test = vectorizer.transform(test["full_text"])

# X_train = pd.DataFrame.sparse.from_spmatrix(X_train)
# X_test = pd.DataFrame.sparse.from_spmatrix(X_test)

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ def import_module_from_path(module_name, module_path):
return module


def MCRMSE(y_true, y_pred):
    """Mean Columnwise Root Mean Squared Error.

    Computes the RMSE of each target column independently (mean over
    rows, axis=0), then averages those per-column RMSEs.
    """
    column_mse = np.mean((y_true - y_pred) ** 2, axis=0)
    return np.sqrt(column_mse).mean()


# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test = preprocess_script()

Expand Down Expand Up @@ -63,33 +67,18 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:

# 4) Evaluate the model on the validation set
y_valid_pred_l = []
metrics_all = []
for model, predict_func in model_l:
y_valid_pred = predict_func(model, X_valid)
y_valid_pred_l.append(y_valid_pred)
# print(y_valid_pred)
# print(y_valid_pred.shape)

# 5) Ensemble
# Majority vote ensemble
y_valid_pred_ensemble = np.mean(y_valid_pred_l, axis=0)

metrics = MCRMSE(y_valid, y_valid_pred)
print(f"MCRMSE on valid set: {metrics}")
metrics_all.append(metrics)

# 6) Save the validation metrics
def MCRMSE(y_true, y_pred):
return np.mean(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0)))


metrics = MCRMSE(y_valid, y_valid_pred_ensemble)
print(f"MCRMSE on valid set: {metrics}")
pd.Series(data=[metrics], index=["MCRMSE"]).to_csv("submission_score.csv")

# 7) Make predictions on the test set and save them
y_test_pred_l = []
for model, predict_func in model_l:
y_test_pred_l.append(predict_func(model, X_test))
min_index = np.argmin(metrics_all)
pd.Series(data=[metrics_all[min_index]], index=["MCRMSE"]).to_csv("submission_score.csv")

# For multiclass classification, use the mode of the predictions
y_test_pred = np.mean(y_test_pred_l, axis=0)
y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test)


submission_result = pd.read_csv("/kaggle/input/sample_submission.csv")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,16 @@

def prepreprocess():
# Load the training data
train_df = pd.read_csv("/kaggle/input/train.csv").head(1000)
train_df = pd.read_csv("/kaggle/input/train.csv")

# Load book and trade data
book_train = pd.read_parquet("/kaggle/input/book_train.parquet").head(1000)
trade_train = pd.read_parquet("/kaggle/input/trade_train.parquet").head(1000)
book_train = pd.read_parquet("/kaggle/input/book_train.parquet")
trade_train = pd.read_parquet("/kaggle/input/trade_train.parquet")

# Merge book and trade data with train_df
# print(book_train.columns.to_list())
# print(trade_train.columns.to_list())
# print(train_df.columns.to_list())
merged_df = pd.merge(train_df, book_train, on=["stock_id", "time_id"], how="left")
merged_df = pd.merge(merged_df, trade_train, on=["stock_id", "time_id"], how="left")

# print(merged_df.columns.to_list())

# Split the data
X = merged_df.drop(["target"], axis=1)
y = merged_df["target"]
Expand Down Expand Up @@ -67,7 +62,6 @@ def preprocess_fit(X_train: pd.DataFrame):
def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols, categorical_cols):
X_transformed = preprocessor.transform(X)

# Convert arrays back to DataFrames
X_transformed = pd.DataFrame(X_transformed, columns=numerical_cols + categorical_cols, index=X.index)

return X_transformed
Expand All @@ -85,12 +79,6 @@ def preprocess_script():
return X_train, X_valid, y_train, y_valid, X_test, *others

X_train, X_valid, y_train, y_valid = prepreprocess()
print(X_train.columns.to_list())

# preprocessor, numerical_cols, categorical_cols = preprocess_fit(X_train)

# X_train = preprocess_transform(X_train, preprocessor, numerical_cols, categorical_cols)
# X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols, categorical_cols)

submission_df = pd.read_csv("/kaggle/input/test.csv")

Expand All @@ -102,8 +90,6 @@ def preprocess_script():
if col not in submission_df.columns:
submission_df[col] = 0 # Fill with 0 or another appropriate value

# X_test = preprocess_transform(submission_df, preprocessor, numerical_cols, categorical_cols)

# Handle missing values
for df in [X_train, X_valid, submission_df]:
df.fillna(df.mean(), inplace=True)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
import pandas as pd

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
Remember
"""

class MidPriceFeature:

class IdentityFeature:
def fit(self, train_df: pd.DataFrame):
return self
"""
Fit the feature engineering model to the training data.
"""
pass

def transform(self, X: pd.DataFrame):
# Check if the required columns exist in the DataFrame
if "bid_price1" not in X.columns or "ask_price1" not in X.columns:
print("Warning: Required columns bid_price1 and ask_price1 are missing from the DataFrame")
return pd.DataFrame(index=X.index)
X["mid_price"] = (X["bid_price1"] + X["ask_price1"]) / 2
return X[["mid_price"]]
"""
Transform the input data.
"""
return X


feature_engineering_cls = MidPriceFeature
feature_engineering_cls = IdentityFeature
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@
import numpy as np
import pandas as pd
from fea_share_preprocess import preprocess_script
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Set random seed for reproducibility
SEED = 42
Expand All @@ -17,7 +15,7 @@


def compute_rmspe(y_true, y_pred):
    """Compute Root Mean Squared Percentage Error (RMSPE) for regression.

    NOTE(review): the error is divided elementwise by y_true, so any zero
    in y_true yields inf/nan — callers must ensure targets are nonzero.
    """
    # Fix: the block carried two stacked docstrings (stale diff artifact);
    # keep a single authoritative one. Computation is unchanged.
    rmspe = np.sqrt(np.mean(((y_true - y_pred) / y_true) ** 2))
    return rmspe

Expand All @@ -29,17 +27,14 @@ def import_module_from_path(module_name, module_path):
return module


print("begin preprocess")
# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()
print("preprocess done")


# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []

print(X_train.columns.tolist())

for f in DIRNAME.glob("feature/feat*.py"):
cls = import_module_from_path(f.stem, f).feature_engineering_cls()
cls.fit(X_train)
Expand All @@ -62,8 +57,6 @@ def import_module_from_path(module_name, module_path):
X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")

X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
Expand Down Expand Up @@ -100,17 +93,19 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
# 4) Evaluate the model on the validation set
y_valid_pred_l = []
metrics_all = []

for model, predict_func in model_l:
y_valid_pred_l.append(predict_func(model, X_valid))
metrics = compute_rmspe(y_valid, y_valid_pred_l[-1].ravel())
print(f"RMSPE on valid set: {metrics}")
metrics_all.append(metrics)

min_index = np.argmin(metrics_all)

pd.Series(data=[metrics_all[min_index]], index=["RMSPE"]).to_csv("submission_score.csv")

y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test)
print(y_test_pred)
y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test).ravel()

# 5) Submit predictions for the test set
submission_result = pd.DataFrame({"row_id": ids, "target": y_test_pred})
submission_result.to_csv("submission.csv", index=False)