Skip to content

Commit e0b03ba

Browse files
authored
feat: add cross validation for kaggle scenario (microsoft#236)
* update cross validation for kaggle scenario * CI Issues * delete useless file * CI issues
1 parent 9fc552a commit e0b03ba

File tree

1 file changed

+89
-0
lines changed

1 file changed

+89
-0
lines changed
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
from pathlib import Path
2+
3+
import numpy as np
4+
import pandas as pd
5+
import xgboost as xgb
6+
from sklearn.metrics import accuracy_score, matthews_corrcoef
7+
from sklearn.model_selection import KFold
8+
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
9+
10+
from rdagent.scenarios.kaggle.experiment.meta_tpl.fea_share_preprocess import preprocess
11+
12+
13+
def compute_metrics_for_classification(y_true, y_pred):
    """Score classification predictions with the Matthews correlation coefficient.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        The value returned by ``sklearn.metrics.matthews_corrcoef``.
    """
    from sklearn import metrics as sk_metrics

    mcc = sk_metrics.matthews_corrcoef(y_true, y_pred)
    return mcc
18+
19+
20+
def _load_modules(dirname, pattern):
    """Import the modules in ``dirname`` whose file names match ``pattern``.

    Args:
        dirname: Directory (a ``Path``) to search. Modules are imported by their
            bare name, so the directory must be importable (e.g. on ``sys.path``).
        pattern: Glob pattern such as ``"feat*.py"``.

    Returns:
        A list of ``(path, module)`` pairs, sorted by path so the order does not
        depend on the file system's directory listing order.
    """
    import importlib

    # ``Path.stem`` removes only the final suffix. ``name.strip(".py")`` strips
    # any leading/trailing '.', 'p' or 'y' characters instead, which mangles
    # names such as "model_happy.py" (-> "model_ha").
    return [(path, importlib.import_module(path.stem)) for path in sorted(dirname.glob(pattern))]


def perform_kfold_cross_validation(X, y, n_splits=2, random_seed=42):
    """Run shuffled K-fold cross validation over the feature/model modules next to this file.

    For every fold the training and validation parts are preprocessed, passed
    through each ``feat*.py`` module's ``feat_eng`` function, and the resulting
    feature blocks are concatenated column-wise. Each ``model*.py`` module is
    then fitted with ``fit`` and scored with MCC on the validation part. The
    fold score is the mean MCC over all models; the overall score is the mean
    over all folds. The overall score is also written to a CSV file named
    after the last model module (in sorted order).

    Args:
        X: Feature table supporting positional row selection via ``.iloc``.
        y: Labels aligned row-wise with ``X``; either an object with ``.iloc``
            (pandas) or something indexable by an integer index array (NumPy).
        n_splits: Number of folds passed to ``KFold``.
        random_seed: Seed for the shuffled split.

    Returns:
        The overall average MCC across all folds.

    Raises:
        ValueError: If no ``feat*.py`` or no ``model*.py`` module is found in
            this file's directory.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    fold_metrics = []

    DIRNAME = Path(__file__).absolute().resolve().parent

    # Discover and import the plug-in modules once; they do not change per fold.
    feature_modules = _load_modules(DIRNAME, "feat*.py")
    if not feature_modules:
        raise ValueError(f"No feature modules matching 'feat*.py' found in {DIRNAME}")
    model_modules = _load_modules(DIRNAME, "model*.py")
    if not model_modules:
        raise ValueError(f"No model modules matching 'model*.py' found in {DIRNAME}")

    def _select_rows(labels, idx):
        # KFold yields positional indices, so pandas objects must be indexed
        # with ``.iloc``; plain ``labels[idx]`` would be label-based for a Series.
        return labels.iloc[idx] if hasattr(labels, "iloc") else labels[idx]

    for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_fold, y_valid_fold = _select_rows(y, train_idx), _select_rows(y, valid_idx)

        # TODO: Preprocess and Feature Engineering before K-Fold CV

        # Preprocess the data
        X_train_fold = preprocess(X_train_fold)
        X_valid_fold = preprocess(X_valid_fold)

        # Feature Engineering: every module receives the same preprocessed
        # data, so its feature block does not depend on which other modules
        # happened to run before it.
        X_train_l_fold, X_valid_l_fold = [], []
        for _, feat_module in feature_modules:
            X_train_l_fold.append(feat_module.feat_eng(X_train_fold))
            X_valid_l_fold.append(feat_module.feat_eng(X_valid_fold))

        X_train_fold = pd.concat(X_train_l_fold, axis=1)
        X_valid_fold = pd.concat(X_valid_l_fold, axis=1)

        # Align features: validation gets exactly the training columns, with
        # columns missing from validation filled with 0.
        X_valid_fold = X_valid_fold.reindex(columns=X_train_fold.columns, fill_value=0)

        # Train and evaluate models
        mcc_scores = []
        for model_path, model_module in model_modules:
            model = model_module.fit(X_train_fold, y_train_fold, X_valid_fold, y_valid_fold)
            y_valid_pred = model_module.predict(model, X_valid_fold)
            mcc = compute_metrics_for_classification(y_valid_fold, y_valid_pred)
            mcc_scores.append(mcc)
            print(f"Fold {fold+1}, Model {model_path.name}: MCC = {mcc}")

        # Store the average MCC score for this fold
        avg_mcc = np.mean(mcc_scores)
        fold_metrics.append(avg_mcc)
        print(f"Fold {fold+1} average MCC: {avg_mcc}")

    # Calculate the overall average MCC
    overall_avg_mcc = np.mean(fold_metrics)
    result_df = pd.DataFrame({"Overall Average MCC": [overall_avg_mcc]})
    # Name the output after the last model module explicitly instead of relying
    # on a loop variable that leaked out of the loops above.
    last_model_stem = model_modules[-1][0].stem
    result_df.to_csv(f"path/to/playground-series-s4e8/cv_score_{last_model_stem}.csv", index=False)

    print(f"Overall Average MCC across all folds: {overall_avg_mcc}")
    return overall_avg_mcc
75+
76+
77+
# This allows the script to be run directly
if __name__ == "__main__":
    # Load the training data and drop the "id" column
    data_df = pd.read_csv("path/to/playground-series-s4e8/train.csv")
    data_df = data_df.drop(["id"], axis=1)

    X = data_df.drop(["class"], axis=1)
    # Select the target as a 1-D Series (not a one-column DataFrame) so the
    # label encoder receives 1-D input and does not have to flatten it.
    y = data_df["class"]

    label_encoder = LabelEncoder()
    # Encode the class labels as integers; the result is a 1-D array
    y = label_encoder.fit_transform(y)
    result = perform_kfold_cross_validation(X, y)

0 commit comments

Comments
 (0)