Skip to content

Commit 034f238

Browse files
WinstonLiyt and TPLin22 authored
feat: refine the template in several Kaggle competitions (microsoft#343)
* add error catching in the model runner
* check forest & s3e11 & s4e8 & spaceship; change params of forest & s3e26 & spaceship
* fix CI errors
---------
Co-authored-by: TPLin22 <tplin2@163.com>
1 parent eb1e3d3 commit 034f238

File tree

16 files changed

+129
-188
lines changed

16 files changed

+129
-188
lines changed

rdagent/scenarios/kaggle/developer/runner.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,9 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
119119

120120
result = exp.experiment_workspace.execute(run_env=env_to_use)
121121

122+
if result is None:
123+
raise CoderError("No result is returned from the experiment workspace")
124+
122125
exp.result = result
123126
if RUNNER_SETTINGS.cache_result:
124127
self.dump_cache_result(exp, result)

rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/cross_validation.py

Lines changed: 0 additions & 99 deletions
This file was deleted.
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import pandas as pd
2+
from catboost import CatBoostClassifier
3+
4+
5+
def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """Train a CatBoost classifier on the training split.

    The validation split is passed as ``eval_set`` so the over-fitting
    detector (``od_wait``) can stop training when accuracy stalls.

    Returns:
        The fitted ``CatBoostClassifier``.

    NOTE(review): ``task_type="GPU"`` / ``devices="0"`` hard-requires a CUDA
    device — confirm the execution environment provides one.
    """
    # Hyper-parameters are identical to the original dict, passed here as
    # explicit keyword arguments.
    model = CatBoostClassifier(
        iterations=5000,
        learning_rate=0.03,
        od_wait=1000,  # over-fitting detector patience (in iterations)
        depth=7,
        task_type="GPU",
        l2_leaf_reg=3,
        eval_metric="Accuracy",
        devices="0",
        verbose=1000,  # log once every 1000 iterations
    )
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid))
    return model
24+
25+
26+
def predict(model, X: pd.DataFrame):
    """Predict labels for the rows of ``X``.

    Returns the model's predictions reshaped to an ``(n, 1)`` column vector,
    matching the shape the downstream ensembling code expects.
    """
    return model.predict(X).reshape(-1, 1)
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import numpy as np
2+
import pandas as pd
3+
import torch
4+
import torch.nn as nn
5+
import torch.optim as optim
6+
from torch.utils.data import DataLoader, TensorDataset
7+
8+
9+
# Define the neural network model with Batch Normalization
10+
class NeuralNetwork(nn.Module):
    """Feed-forward multi-class classifier.

    Architecture: input -> 128 -> 64 -> num_classes, with BatchNorm1d and
    ReLU after each hidden layer. ``forward`` returns raw logits.
    """

    def __init__(self, input_size, num_classes):
        super(NeuralNetwork, self).__init__()
        self.layer1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.layer2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.layer3 = nn.Linear(64, num_classes)

    def forward(self, x):
        x = torch.relu(self.bn1(self.layer1(x)))
        x = torch.relu(self.bn2(self.layer2(x)))
        # Return raw logits: the training loop feeds this output to
        # nn.CrossEntropyLoss, which applies log-softmax internally. The
        # original applied torch.softmax here as well, double-normalizing
        # the loss and flattening its gradients; argmax-based prediction
        # is unaffected by removing it.
        return self.layer3(x)
24+
25+
26+
def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """Train the feed-forward classifier and report validation accuracy.

    Args:
        X_train, y_train: training features and integer class labels.
        X_valid, y_valid: held-out split, evaluated after every epoch.

    Returns:
        The trained ``NeuralNetwork`` instance.

    NOTE(review): assumes labels are encoded as ``0..num_classes-1`` and
    that iterating ``y_train`` yields label values (Series-like) — confirm
    against callers, since the annotation says ``pd.DataFrame``.
    """
    # Tensors for both splits; labels must be int64 for CrossEntropyLoss.
    train_x = torch.tensor(X_train.values, dtype=torch.float32)
    train_y = torch.tensor(y_train.values, dtype=torch.long)
    valid_x = torch.tensor(X_valid.values, dtype=torch.float32)
    valid_y = torch.tensor(y_valid.values, dtype=torch.long)

    # Wrap the tensors in datasets and mini-batch loaders; only the
    # training loader shuffles.
    train_data = TensorDataset(train_x, train_y)
    valid_data = TensorDataset(valid_x, valid_y)
    train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
    valid_loader = DataLoader(valid_data, batch_size=32, shuffle=False)

    # Model, loss, and optimizer.
    model = NeuralNetwork(input_size=X_train.shape[1], num_classes=len(set(y_train)))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    num_epochs = 150
    for epoch in range(num_epochs):
        # One optimization pass over the training set.
        model.train()
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_x), batch_y)
            batch_loss.backward()
            optimizer.step()

        # Evaluate on the validation split.
        model.eval()
        valid_loss = 0
        correct = 0
        with torch.no_grad():
            for batch_x, batch_y in valid_loader:
                outputs = model(batch_x)
                valid_loss += criterion(outputs, batch_y).item()
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == batch_y).sum().item()

        accuracy = correct / len(valid_loader.dataset)
        print(f"Epoch {epoch+1}/{num_epochs}, Validation Accuracy: {accuracy:.4f}")

    return model
70+
71+
72+
def predict(model, X):
    """Return predicted class ids for the rows of ``X`` as an (n, 1) column.

    Puts the model in eval mode (fixes BatchNorm to running stats) and runs
    a single no-grad forward pass.
    """
    features = torch.tensor(X.values, dtype=torch.float32)
    model.eval()
    with torch.no_grad():
        class_ids = torch.max(model(features), 1).indices
    return class_ids.numpy().reshape(-1, 1)

rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_randomforest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali
2323
Define and train the Random Forest model. Merge feature selection into the pipeline.
2424
"""
2525
# Initialize the Random Forest model
26-
model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)
26+
model = RandomForestClassifier(n_estimators=200, random_state=32, n_jobs=-1)
2727

2828
# Select features (if any feature selection is needed)
2929
X_train_selected = select(X_train)

rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_xgboost.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
2323
"num_class": len(set(y_train)), # Number of classes
2424
"nthread": -1,
2525
}
26-
num_round = 20
26+
num_round = 100
2727

2828
evallist = [(dtrain, "train"), (dvalid, "eval")]
2929
bst = xgb.train(params, dtrain, num_round, evallist)

rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from sklearn.impute import SimpleImputer
99
from sklearn.metrics import accuracy_score
1010
from sklearn.model_selection import KFold
11+
from sklearn.preprocessing import StandardScaler
1112

1213
# Set random seed for reproducibility
1314
SEED = 42
@@ -40,6 +41,7 @@ def import_module_from_path(module_name, module_path):
4041
# Store results
4142
accuracies = []
4243
y_test_pred_l = []
44+
scaler = StandardScaler()
4345

4446
# 3) Train and evaluate using KFold
4547
fold_number = 1
@@ -80,6 +82,11 @@ def import_module_from_path(module_name, module_path):
8082
X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)
8183
X_te = pd.DataFrame(imputer.transform(X_te), columns=X_te.columns)
8284

85+
# Standardize the data
86+
X_tr = pd.DataFrame(scaler.fit_transform(X_tr), columns=X_tr.columns)
87+
X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
88+
X_te = pd.DataFrame(scaler.transform(X_te), columns=X_te.columns)
89+
8390
# Remove duplicate columns
8491
X_tr = X_tr.loc[:, ~X_tr.columns.duplicated()]
8592
X_val = X_val.loc[:, ~X_val.columns.duplicated()]

rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/model_xgboost.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,4 @@ def predict(model, X_test):
3939
"""
4040
X_test = select(X_test)
4141
y_pred = model.predict(X_test)
42-
return y_pred
42+
return y_pred.reshape(-1, 1)

rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/train.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
8585

8686

8787
# For multiclass classification, use the mode of the predictions
88-
y_test_pred = np.mean(y_test_pred_l, axis=0)
88+
y_test_pred = np.mean(y_test_pred_l, axis=0).ravel()
8989

9090

9191
submission_result = pd.DataFrame(np.expm1(y_test_pred), columns=["cost"])

rdagent/scenarios/kaggle/experiment/playground-series-s3e26_template/model/model_randomforest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali
2323
Define and train the Random Forest model. Merge feature selection into the pipeline.
2424
"""
2525
# Initialize the Random Forest model
26-
model = RandomForestClassifier(n_estimators=10, random_state=32, n_jobs=-1)
26+
model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)
2727

2828
# Select features (if any feature selection is needed)
2929
X_train_selected = select(X_train)

0 commit comments

Comments
 (0)