-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlgb_tfidf.py
More file actions
95 lines (74 loc) · 2.89 KB
/
lgb_tfidf.py
File metadata and controls
95 lines (74 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
import argparse
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from utils.data import load_dataset, get_weight
from utils.features import tfidf
from utils.metrics import metric_f1
from utils.observer import get_current_time
from utils.utils import fix_seed
def parse_args(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads the options from ``sys.argv[1:]``.

    Returns:
        argparse.Namespace whose ``submit`` attribute is a bool.
    """
    # Spellings accepted as "yes". Matching is case-insensitive so that
    # `--submit true` does not silently disable writing the submission.
    truthy = ('true', 't', 'yes', 'y', 'on', '1')

    parser = argparse.ArgumentParser()
    parser.add_argument('--submit', type=str, default="False")
    args = parser.parse_args(argv)
    # Anything not recognised as truthy (including the default) maps to False.
    args.submit = args.submit.strip().lower() in truthy
    return args
# Parsed at module level so `args` stays available as a module attribute;
# `args.submit` decides whether a submission file is written at the end.
args = parse_args()

if __name__ == "__main__":
    # ---------- hyper params ---------- #
    seed = 1
    fix_seed(seed)
    max_features = None
    n_folds = 5
    num_class = 4
    num_boost_round = 1000
    early_stopping_rounds = 100

    # The same for every fold, so it is built once outside the loop.
    params = {
        'objective': 'multiclass',
        'num_class': num_class,
        # The string 'None' turns off LightGBM's built-in metrics, leaving
        # the custom `feval` as the only evaluation metric.
        'metric': 'None',
        'verbose': -1,
        'seed': seed,
    }

    # ---------- data ---------- #
    train_df, test_df, _ = load_dataset()
    X, X_test = tfidf(train_df, test_df, max_features=max_features)
    # Shift the labels down by one so that they start at 0.
    y = train_df['jobflag'].astype(int) - 1
    # NOTE(review): `weight` is indexed with the fold's positional indices
    # below; presumably it is aligned row-for-row with train_df -- confirm.
    weight = get_weight(train_df)

    # ---------- Kfold ---------- #
    scores = []
    # Sized from `num_class` (the width of the model output) so that the
    # per-fold accumulation below always has matching shapes.
    y_test_pred = np.zeros((X_test.shape[0], num_class), dtype='float32')
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for i, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        train_data = lgb.Dataset(X_train, label=y_train, weight=weight[train_idx])
        valid_data = lgb.Dataset(X_valid, label=y_valid, weight=weight[valid_idx])

        # train
        # NOTE(review): `early_stopping_rounds` and `verbose_eval` were
        # removed from lgb.train() in LightGBM 4.0; on newer versions they
        # have to be replaced by callbacks -- confirm the pinned version.
        model = lgb.train(params,
                          train_data,
                          valid_sets=[train_data, valid_data],
                          num_boost_round=num_boost_round,
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=10,
                          feval=metric_f1)

        # evaluate (same iteration count as the test prediction below)
        valid_proba = model.predict(X_valid, num_iteration=model.best_iteration)
        y_val_pred = np.argmax(valid_proba, axis=1)
        score = f1_score(y_valid, y_val_pred, average='macro')
        scores.append(score)
        print(f"\nFold-{i+1}: Score: {score:.4f}\n")

        # predict test: average the class probabilities over the folds
        y_test_pred += model.predict(X_test, num_iteration=model.best_iteration) / n_folds

    # evaluate
    print(f"Kfold F1 Score: {np.mean(scores):.4f}")

    # submit
    if args.submit:
        # Undo the label shift applied to `y` above.
        pred = np.argmax(y_test_pred, axis=1) + 1
        submit = pd.DataFrame({'index': test_df['id'], 'pred': pred})
        model_name = "lgb_tfidf"
        current_time = get_current_time()
        filename = f"{current_time}_{model_name}.csv"
        # Create the output directory on first use instead of failing in to_csv.
        os.makedirs('submits', exist_ok=True)
        filepath = os.path.join('submits', filename)
        submit.to_csv(filepath, index=False, header=False)
        print(f'Save {filepath}')