Skip to content

Commit 8c57524

Browse files
authored
feat: add s3e11 kaggle template (microsoft#324)
* s3e11 tpl v1 * some changes * fix some bugs in s3e11 tpl, change docker logs color * fix CI
1 parent 91979c0 commit 8c57524

File tree

5 files changed

+199
-1
lines changed

5 files changed

+199
-1
lines changed
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import os
2+
3+
import numpy as np # linear algebra
4+
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
5+
from sklearn.model_selection import train_test_split
6+
7+
8+
def preprocess_script():
    """
    Load and preprocess the training, validation, and test datasets.

    Returns (X_train, X_valid, y_train, y_valid, X_test, ids_or_others).
    If pre-split pickles exist under /kaggle/input they are loaded directly;
    otherwise the raw CSVs are read, feature-engineered, and split.
    """

    def _add_shared_features(df):
        # Feature engineering applied identically to train and test frames.
        df["store_sqft"] = df["store_sqft"].astype("category")
        df["salad"] = (df["salad_bar"] + df["prepared_food"]) / 2
        return df

    # Fast path: a previous run already pickled the splits.
    if os.path.exists("/kaggle/input/X_train.pkl"):
        loaded = [
            pd.read_pickle(f"/kaggle/input/{name}.pkl")
            for name in ("X_train", "X_valid", "y_train", "y_valid", "X_test", "others")
        ]
        X_train, X_valid, y_train, y_valid, X_test, others = loaded
        return X_train, X_valid, y_train, y_valid, X_test, *others

    # Training data: engineer features and log-transform the target.
    train = _add_shared_features(pd.read_csv("/kaggle/input/train.csv"))
    train["log_cost"] = np.log1p(train["cost"])

    # Hand-picked feature subset used for both training and inference.
    selected_cols = [
        "total_children",
        "num_children_at_home",
        "avg_cars_at home(approx).1",
        "store_sqft",
        "coffee_bar",
        "video_store",
        "salad",
        "florist",
    ]

    X_train, X_valid, y_train, y_valid = train_test_split(
        train[selected_cols], train["log_cost"], test_size=0.2, random_state=2023
    )

    # Test data: same engineered features, keep ids for the submission file.
    test = _add_shared_features(pd.read_csv("/kaggle/input/test.csv"))
    ids = test["id"]
    X_test = test.drop(["id"], axis=1)[selected_cols]

    return X_train, X_valid, y_train, y_valid, X_test, ids
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import pandas as pd
2+
3+
"""
4+
Here is the feature engineering code for each task, with a class that has a fit and transform method.
5+
Remember
6+
"""
7+
8+
9+
class IdentityFeature:
    """No-op feature engineering: passes the input frame through unchanged."""

    def fit(self, train_df: pd.DataFrame):
        """Nothing to learn for the identity transform."""

    def transform(self, X: pd.DataFrame):
        """Return *X* unchanged."""
        return X


# Entry point discovered by the training pipeline's feature loader.
feature_engineering_cls = IdentityFeature
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
"""
2+
motivation of the model
3+
"""
4+
5+
import pandas as pd
6+
import xgboost as xgb
7+
8+
9+
def select(X: pd.DataFrame) -> pd.DataFrame:
    """Feature-selection hook; currently a pass-through returning *X* as-is."""
    return X
12+
13+
14+
def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """Define and train the XGBoost regressor on the selected training features.

    NOTE(review): X_valid/y_valid are accepted for interface consistency with
    the pipeline but are not used during fitting — confirm whether an eval_set
    was intended.
    """
    model = xgb.XGBRegressor(
        n_estimators=280,
        learning_rate=0.05,
        max_depth=10,
        subsample=1.0,
        colsample_bytree=1.0,
        tree_method="hist",
        # Required so the categorical "store_sqft" column is handled natively.
        enable_categorical=True,
        verbosity=1,
        min_child_weight=3,
        base_score=4.6,
        random_state=2023,
    )
    model.fit(select(X_train), y_train)
    return model
34+
35+
36+
def predict(model, X_test):
    """Predict with *model*, applying the same feature selection as fit."""
    return model.predict(select(X_test))
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import importlib.util
2+
from pathlib import Path
3+
4+
import numpy as np
5+
import pandas as pd
6+
from fea_share_preprocess import preprocess_script
7+
from sklearn.metrics import mean_squared_error
8+
9+
# Directory containing this script; used to discover feature/model modules.
DIRNAME = Path(__file__).absolute().resolve().parent


def import_module_from_path(module_name, module_path):
    """Dynamically load and execute a Python module from an explicit file path."""
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    loaded = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(loaded)
    return loaded
17+
18+
19+
# 1) Load and preprocess all datasets.
X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()

# 2) Apply every feature-engineering module found under feature/.
train_parts, valid_parts, test_parts = [], [], []
for feat_file in DIRNAME.glob("feature/feat*.py"):
    engineer = import_module_from_path(feat_file.stem, feat_file).feature_engineering_cls()
    engineer.fit(X_train)
    train_parts.append(engineer.transform(X_train))
    valid_parts.append(engineer.transform(X_valid))
    test_parts.append(engineer.transform(X_test))

# Concatenate per-module outputs column-wise, namespaced as feature_<i>.
X_train = pd.concat(train_parts, axis=1, keys=[f"feature_{i}" for i in range(len(train_parts))])
X_valid = pd.concat(valid_parts, axis=1, keys=[f"feature_{i}" for i in range(len(valid_parts))])
X_test = pd.concat(test_parts, axis=1, keys=[f"feature_{i}" for i in range(len(test_parts))])

# 3) Fit every model module found under model/.
model_l = []  # list of (fitted_model, predict_function) pairs
for model_file in DIRNAME.glob("model/model*.py"):
    mod = import_module_from_path(model_file.stem, model_file)
    model_l.append((mod.fit(X_train, y_train, X_valid, y_valid), mod.predict))

# 4) Validation-set predictions from each fitted model.
y_valid_pred_l = [predict_func(model, X_valid) for model, predict_func in model_l]

# 5) Ensemble by averaging the per-model predictions.
y_valid_pred_ensemble = np.mean(y_valid_pred_l, axis=0)

# 6) Save the validation metric (RMSE of the log-transformed target).
metrics = mean_squared_error(y_valid, y_valid_pred_ensemble, squared=False)
print(f"RMLSE on valid set: {metrics}")
pd.Series(data=[metrics], index=["RMLSE"]).to_csv("submission_score.csv")

# 7) Average the test-set predictions across models.
y_test_pred_l = [predict_func(model, X_test) for model, predict_func in model_l]
y_test_pred = np.mean(y_test_pred_l, axis=0)

# Predictions are in log space; invert with expm1 before writing the submission.
submission_result = pd.DataFrame(np.expm1(y_test_pred), columns=["cost"])
submission_result.insert(0, "id", ids)

submission_result.to_csv("submission.csv", index=False)

rdagent/utils/env.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@
2323
import docker.models.containers
2424
from pydantic import BaseModel
2525
from pydantic_settings import BaseSettings
26+
from rich import print
2627
from rich.progress import Progress, TextColumn
28+
from rich.rule import Rule
2729

2830
from rdagent.log import rdagent_logger as logger
2931

@@ -207,7 +209,7 @@ def prepare(self):
207209
status_dict = json.loads(part)
208210
if "error" in status_dict:
209211
p.update(task, description=f"[red]error: {status_dict['error']}")
210-
raise docker.errors.BuildError(status_dict["error"])
212+
raise docker.errors.BuildError(status_dict["error"], "")
211213
if "stream" in status_dict:
212214
p.update(task, description=status_dict["stream"])
213215
logger.info(f"Finished building the image from dockerfile: {self.conf.dockerfile_folder_path}")
@@ -305,10 +307,12 @@ def run(
305307
**self._gpu_kwargs(client),
306308
)
307309
logs = container.logs(stream=True)
310+
print(Rule("[bold green]Docker Logs Begin[/bold green]", style="dark_orange"))
308311
for log in logs:
309312
decoded_log = log.strip().decode()
310313
print(decoded_log)
311314
log_output += decoded_log + "\n"
315+
print(Rule("[bold green]Docker Logs End[/bold green]", style="dark_orange"))
312316
container.wait()
313317
container.stop()
314318
container.remove()

0 commit comments

Comments
 (0)