Skip to content

Commit 034f238

Browse files
WinstonLiyt and TPLin22 authored
feat: refine the template in several Kaggle competitions (microsoft#343)
* add error catching in the model runner
* check forest & s3e11 & s4e8 & spaceship; change params of forest & s3e26 & spaceship
* fix CI errors
---------
Co-authored-by: TPLin22 <tplin2@163.com>
1 parent eb1e3d3 commit 034f238

File tree

16 files changed

+129
-188
lines changed

16 files changed

+129
-188
lines changed

rdagent/scenarios/kaggle/developer/runner.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,9 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
119119

120120
result = exp.experiment_workspace.execute(run_env=env_to_use)
121121

122+
if result is None:
123+
raise CoderError("No result is returned from the experiment workspace")
124+
122125
exp.result = result
123126
if RUNNER_SETTINGS.cache_result:
124127
self.dump_cache_result(exp, result)

rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/cross_validation.py

Lines changed: 0 additions & 99 deletions
This file was deleted.
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import pandas as pd
2+
from catboost import CatBoostClassifier
3+
4+
5+
def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """Train a CatBoost classifier on the training split.

    The validation split is passed as ``eval_set`` so the over-fitting
    detector (``od_wait``) can stop training when accuracy stalls.

    Returns:
        The fitted ``CatBoostClassifier``.

    NOTE(review): ``task_type="GPU"`` / ``devices="0"`` hard-requires a CUDA
    device — confirm the execution environment provides one.
    """
    # Hyper-parameters are identical to the original dict, passed here as
    # explicit keyword arguments.
    model = CatBoostClassifier(
        iterations=5000,
        learning_rate=0.03,
        od_wait=1000,  # over-fitting detector patience (in iterations)
        depth=7,
        task_type="GPU",
        l2_leaf_reg=3,
        eval_metric="Accuracy",
        devices="0",
        verbose=1000,  # log once every 1000 iterations
    )
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid))
    return model
24+
25+
26+
def predict(model, X: pd.DataFrame):
    """Predict labels for the rows of ``X``.

    Returns the model's predictions reshaped to an ``(n, 1)`` column vector,
    matching the shape the downstream ensembling code expects.
    """
    return model.predict(X).reshape(-1, 1)
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import numpy as np
2+
import pandas as pd
3+
import torch
4+
import torch.nn as nn
5+
import torch.optim as optim
6+
from torch.utils.data import DataLoader, TensorDataset
7+
8+
9+
# Define the neural network model with Batch Normalization
10+
class NeuralNetwork(nn.Module):
    """Feed-forward multi-class classifier.

    Architecture: input -> 128 -> 64 -> num_classes, with BatchNorm1d and
    ReLU after each hidden layer. ``forward`` returns raw logits.
    """

    def __init__(self, input_size, num_classes):
        super(NeuralNetwork, self).__init__()
        self.layer1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.layer2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.layer3 = nn.Linear(64, num_classes)

    def forward(self, x):
        x = torch.relu(self.bn1(self.layer1(x)))
        x = torch.relu(self.bn2(self.layer2(x)))
        # Return raw logits: the training loop feeds this output to
        # nn.CrossEntropyLoss, which applies log-softmax internally. The
        # original applied torch.softmax here as well, double-normalizing
        # the loss and flattening its gradients; argmax-based prediction
        # is unaffected by removing it.
        return self.layer3(x)
24+
25+
26+
def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """Train the feed-forward classifier and report validation accuracy.

    Args:
        X_train, y_train: training features and integer class labels.
        X_valid, y_valid: held-out split, evaluated after every epoch.

    Returns:
        The trained ``NeuralNetwork`` instance.

    NOTE(review): assumes labels are encoded as ``0..num_classes-1`` and
    that iterating ``y_train`` yields label values (Series-like) — confirm
    against callers, since the annotation says ``pd.DataFrame``.
    """
    # Tensors for both splits; labels must be int64 for CrossEntropyLoss.
    train_x = torch.tensor(X_train.values, dtype=torch.float32)
    train_y = torch.tensor(y_train.values, dtype=torch.long)
    valid_x = torch.tensor(X_valid.values, dtype=torch.float32)
    valid_y = torch.tensor(y_valid.values, dtype=torch.long)

    # Wrap the tensors in datasets and mini-batch loaders; only the
    # training loader shuffles.
    train_data = TensorDataset(train_x, train_y)
    valid_data = TensorDataset(valid_x, valid_y)
    train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
    valid_loader = DataLoader(valid_data, batch_size=32, shuffle=False)

    # Model, loss, and optimizer.
    model = NeuralNetwork(input_size=X_train.shape[1], num_classes=len(set(y_train)))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    num_epochs = 150
    for epoch in range(num_epochs):
        # One optimization pass over the training set.
        model.train()
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_x), batch_y)
            batch_loss.backward()
            optimizer.step()

        # Evaluate on the validation split.
        model.eval()
        valid_loss = 0
        correct = 0
        with torch.no_grad():
            for batch_x, batch_y in valid_loader:
                outputs = model(batch_x)
                valid_loss += criterion(outputs, batch_y).item()
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == batch_y).sum().item()

        accuracy = correct / len(valid_loader.dataset)
        print(f"Epoch {epoch+1}/{num_epochs}, Validation Accuracy: {accuracy:.4f}")

    return model
70+
71+
72+
def predict(model, X):
    """Return predicted class ids for the rows of ``X`` as an (n, 1) column.

    Puts the model in eval mode (fixes BatchNorm to running stats) and runs
    a single no-grad forward pass.
    """
    features = torch.tensor(X.values, dtype=torch.float32)
    model.eval()
    with torch.no_grad():
        class_ids = torch.max(model(features), 1).indices
    return class_ids.numpy().reshape(-1, 1)

rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_randomforest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali
2323
Define and train the Random Forest model. Merge feature selection into the pipeline.
2424
"""
2525
# Initialize the Random Forest model
26-
model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)
26+
model = RandomForestClassifier(n_estimators=200, random_state=32, n_jobs=-1)
2727

2828
# Select features (if any feature selection is needed)
2929
X_train_selected = select(X_train)

rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_xgboost.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
2323
"num_class": len(set(y_train)), # Number of classes
2424
"nthread": -1,
2525
}
26-
num_round = 20
26+
num_round = 100
2727

2828
evallist = [(dtrain, "train"), (dvalid, "eval")]
2929
bst = xgb.train(params, dtrain, num_round, evallist)

rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from sklearn.impute import SimpleImputer
99
from sklearn.metrics import accuracy_score
1010
from sklearn.model_selection import KFold
11+
from sklearn.preprocessing import StandardScaler
1112

1213
# Set random seed for reproducibility
1314
SEED = 42
@@ -40,6 +41,7 @@ def import_module_from_path(module_name, module_path):
4041
# Store results
4142
accuracies = []
4243
y_test_pred_l = []
44+
scaler = StandardScaler()
4345

4446
# 3) Train and evaluate using KFold
4547
fold_number = 1
@@ -80,6 +82,11 @@ def import_module_from_path(module_name, module_path):
8082
X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)
8183
X_te = pd.DataFrame(imputer.transform(X_te), columns=X_te.columns)
8284

85+
# Standardize the data
86+
X_tr = pd.DataFrame(scaler.fit_transform(X_tr), columns=X_tr.columns)
87+
X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
88+
X_te = pd.DataFrame(scaler.transform(X_te), columns=X_te.columns)
89+
8390
# Remove duplicate columns
8491
X_tr = X_tr.loc[:, ~X_tr.columns.duplicated()]
8592
X_val = X_val.loc[:, ~X_val.columns.duplicated()]

rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/model_xgboost.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,4 @@ def predict(model, X_test):
3939
"""
4040
X_test = select(X_test)
4141
y_pred = model.predict(X_test)
42-
return y_pred
42+
return y_pred.reshape(-1, 1)

rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/train.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
8585

8686

8787
# For multiclass classification, use the mode of the predictions
88-
y_test_pred = np.mean(y_test_pred_l, axis=0)
88+
y_test_pred = np.mean(y_test_pred_l, axis=0).ravel()
8989

9090

9191
submission_result = pd.DataFrame(np.expm1(y_test_pred), columns=["cost"])

rdagent/scenarios/kaggle/experiment/playground-series-s3e26_template/model/model_randomforest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali
2323
Define and train the Random Forest model. Merge feature selection into the pipeline.
2424
"""
2525
# Initialize the Random Forest model
26-
model = RandomForestClassifier(n_estimators=10, random_state=32, n_jobs=-1)
26+
model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)
2727

2828
# Select features (if any feature selection is needed)
2929
X_train_selected = select(X_train)

0 commit comments

Comments
 (0)