Skip to content

Commit cb52258

Browse files
Add files via upload
1 parent f4f1796 commit cb52258

5 files changed

Lines changed: 15297 additions & 19 deletions

File tree

classification.py

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,14 @@
2727
from eli5.formatters import format_as_text
2828
import pandas as pd
2929
from sklearn.externals import joblib
30+
from keras.callbacks import ModelCheckpoint
31+
from keras.wrappers.scikit_learn import KerasClassifier
32+
from keras.models import Sequential
33+
from keras.layers import Dense
3034

3135

36+
keras = False
37+
3238
def parse_string(a_str):
3339
to_ret = "".join([c.lower() for c in a_str if c in string.ascii_letters or c in string.whitespace])
3440
to_ret2 = to_ret.split()
@@ -54,7 +60,7 @@ def transform(self, X):
5460
else:
5561
a_set = Sentence(p_str)
5662
stacked_embeddings.embed(a_set)
57-
list_of_emb.append(a_set.get_embedding().detach().numpy())
63+
list_of_emb.append(a_set.get_embedding().cpu().detach().numpy())
5864
to_ret = np.array(list_of_emb)
5965
else:
6066
try:
@@ -64,7 +70,7 @@ def transform(self, X):
6470
else:
6571
a_set = Sentence(p_str)
6672
stacked_embeddings.embed(a_set)
67-
to_ret = a_set.get_embedding().detach().numpy().reshape(1, -1)
73+
to_ret = a_set.get_embedding().cpu().detach().numpy().reshape(1, -1)
6874
except:
6975
print(type(X))
7076
print(X)
@@ -73,7 +79,7 @@ def transform(self, X):
7379

7480

7581
stacked_embeddings = DocumentPoolEmbeddings([WordEmbeddings('en'),
76-
WordEmbeddings('glove'),])
82+
WordEmbeddings('glove'), WordEmbeddings('extvec')])
7783

7884
with open('card_classification.csv') as csvfile:
7985
reader = csv.reader(csvfile)
@@ -86,17 +92,36 @@ def transform(self, X):
8692
list_of_sentences.append(parsed_string)
8793
set_obj = Sentence(parsed_string)
8894
stacked_embeddings.embed(set_obj)
89-
list_of_embeddings.append(set_obj.get_embedding().detach().numpy())
95+
list_of_embeddings.append(set_obj.get_embedding().cpu().detach().numpy())
9096

9197

9298
X_train, X_val, Y_train, Y_val, Emb_train, Emb_val = train_test_split(list_of_sentences, list_of_labels, list_of_embeddings, test_size = 0.30, stratify = list_of_labels, random_state=42)
9399

94100

101+
102+
def create_model(optimizer='adam', kernel_initializer='glorot_uniform', epochs=5,
                 hidden_units=None, n_classes=11):
    """Build and compile the two-layer dense classifier used by KerasClassifier.

    The network is a ReLU hidden layer followed by a softmax output layer,
    both without bias terms, compiled with categorical cross-entropy loss
    and an accuracy metric.

    Args:
        optimizer: Optimizer name or instance handed to ``model.compile``.
        kernel_initializer: Initializer for the softmax output layer. The
            hidden layer always uses ``'he_uniform'``.
        epochs: Accepted for interface compatibility; it is not used while
            building the network.
        hidden_units: Width of the hidden layer. When ``None`` it defaults
            to the size of one embedding from the module-level
            ``list_of_embeddings``.
        n_classes: Number of output units (defaults to the original
            hard-coded value of 11).

    Returns:
        The compiled ``Sequential`` model.
    """
    if hidden_units is None:
        # Index 0 instead of 1 so a data set with a single sample still works.
        # Assumes every embedding has the same dimensionality — TODO confirm.
        hidden_units = list_of_embeddings[0].size

    model = Sequential()
    model.add(Dense(hidden_units, activation='relu',
                    kernel_initializer='he_uniform', use_bias=False))
    model.add(Dense(n_classes, activation='softmax',
                    kernel_initializer=kernel_initializer, use_bias=False))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer,
                  metrics=['accuracy'])
    return model
108+
109+
110+
111+
# Choose the estimator for the pipeline. The branches are mutually exclusive:
# previously the scikit-learn assignment ran unconditionally and silently
# replaced the KerasClassifier even when `keras` was True, which would then
# break the later `pipe.named_steps['model'].model.save(...)` call.
if keras:
    # save_best_only=True keeps only the best checkpoint written to disk.
    checkpointer = ModelCheckpoint(filepath='/tmp/weights.hdf5', verbose=1, save_best_only=True)
    model = KerasClassifier(build_fn=create_model, batch_size=32, epochs=100,
                            callbacks=[checkpointer], validation_split=0.2)
else:
    # Alternative scikit-learn estimators kept for experimentation:
    #model = SVC(kernel = "rbf", probability = True)
    model = KNeighborsClassifier(n_neighbors=5, metric='cosine', weights='distance')
    #model = AdaBoostClassifier(n_estimators = 100, random_state = 42)
    #model = RandomForestClassifier(n_jobs = -1, n_estimators = 100, max_features = "auto", criterion = "entropy")
    #model = MLPClassifier(hidden_layer_sizes=(500,), activation = 'relu', solver = 'adam', verbose = True, max_iter = 100) #early_stopping = True, validation_fraction = 0.3, n_iter_no_change = 100)

# Raw sentences go in; Text2Vec turns them into embeddings for the model.
pipe = Pipeline([('text2vec', Text2Vec()), ('model', model)])
#model.fit(Emb_train, Y_train)
pipe.fit(X_train, Y_train)
@@ -115,5 +140,8 @@ def transform(self, X):
115140
a_df[a_df.eq(0)] = np.nan
116141
print(a_df)
117142

143+
if keras:
144+
pipe.named_steps['model'].model.save('keras_model.h5')
145+
pipe.named_steps['model'].model = None
118146
joblib.dump(pipe, 'saved_card_classification.pkl')
119147
print("Model Dumped!!!!")

docu_learn.py

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,21 +29,35 @@
2929
import eli5
3030
from eli5.lime import TextExplainer
3131
from eli5 import explain_prediction
32-
from eli5.formatters import format_as_text
32+
from eli5.formatters import format_as_text, format_as_html
3333
import pandas as pd
34+
from IPython.display import display
35+
from keras.callbacks import ModelCheckpoint
36+
from keras.wrappers.scikit_learn import KerasClassifier
37+
from keras.models import Sequential
38+
from keras.layers import Dense
39+
from keras.models import load_model
3440

35-
41+
keras = False
3642
increment = False
3743

3844
stacked_embeddings = DocumentPoolEmbeddings([
3945
WordEmbeddings('en'),
4046
WordEmbeddings('glove'),
41-
#WordEmbeddings('extvec'),#ELMoEmbeddings('original'),
47+
WordEmbeddings('extvec'),#ELMoEmbeddings('original'),
4248
#BertEmbeddings('bert-base-cased'),
4349
#FlairEmbeddings('news-forward-fast'),
4450
#FlairEmbeddings('news-backward-fast'),
4551
]) #, mode='max')
4652

53+
def create_model(optimizer='adam', kernel_initializer='glorot_uniform', epochs=5,
                 hidden_units=None, n_classes=11):
    """Build and compile the two-layer dense classifier used by KerasClassifier.

    The network is a ReLU hidden layer followed by a softmax output layer,
    both with bias terms, compiled with categorical cross-entropy loss and
    an accuracy metric.

    Args:
        optimizer: Optimizer name or instance handed to ``model.compile``.
        kernel_initializer: Initializer for the softmax output layer. The
            hidden layer always uses ``'he_uniform'``.
        epochs: Accepted for interface compatibility; it is not used while
            building the network.
        hidden_units: Width of the hidden layer. When ``None`` it defaults
            to the size of one embedding from ``list_of_embeddings``.
        n_classes: Number of output units (defaults to the original
            hard-coded value of 11).

    Returns:
        The compiled ``Sequential`` model.
    """
    if hidden_units is None:
        # NOTE(review): `list_of_embeddings` is not defined in the visible
        # part of this file — confirm it exists at module level, or pass
        # `hidden_units` explicitly. Index 0 (not 1) is used so that a
        # single-sample list still works.
        hidden_units = list_of_embeddings[0].size

    model = Sequential()
    model.add(Dense(hidden_units, activation='relu',
                    kernel_initializer='he_uniform', use_bias=True))
    model.add(Dense(n_classes, activation='softmax',
                    kernel_initializer=kernel_initializer, use_bias=True))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer,
                  metrics=['accuracy'])
    return model
59+
60+
4761
def parse_string(a_str):
4862
to_ret = "".join([c.lower() for c in a_str if c in string.ascii_letters or c in string.whitespace])
4963
to_ret2 = to_ret.split()
@@ -69,7 +83,7 @@ def transform(self, X):
6983
else:
7084
a_set = Sentence(p_str)
7185
stacked_embeddings.embed(a_set)
72-
list_of_emb.append(a_set.get_embedding().detach().numpy())
86+
list_of_emb.append(a_set.get_embedding().cpu().detach().numpy())
7387
to_ret = np.array(list_of_emb)
7488
else:
7589
try:
@@ -79,7 +93,7 @@ def transform(self, X):
7993
else:
8094
a_set = Sentence(p_str)
8195
stacked_embeddings.embed(a_set)
82-
to_ret = a_set.get_embedding().detach().numpy().reshape(1, -1)
96+
to_ret = a_set.get_embedding().cpu().detach().numpy().reshape(1, -1)
8397
except:
8498
print(type(X))
8599
print(X)
@@ -89,18 +103,23 @@ def transform(self, X):
89103

90104
pipe = joblib.load('saved_card_classification.pkl')
91105

92-
te = TextExplainer(random_state=42, n_samples=1000, position_dependent=True)
106+
if keras:
107+
pipe.named_steps['model'].model = load_model('keras_model.h5')
108+
109+
110+
te = TextExplainer(random_state=42, n_samples=10000, position_dependent=True)
93111

94112
def explain_pred(sentence):
    """Explain the pipeline's prediction for ``sentence`` and log it as HTML.

    Fits the module-level ``TextExplainer`` (``te``) against
    ``pipe.predict_proba``, renders the explanation of the top 20 features
    as HTML, appends it to ``latest_prediction.html`` and prints the
    explainer's fit metrics.

    Args:
        sentence: The text whose classification should be explained.
    """
    te.fit(sentence, pipe.predict_proba)
    t_pred = te.explain_prediction(
        top=20,
        target_names=["ANB", "CAP", "ECON", "EDU", "ENV", "EX", "FED", "HEG", "NAT", "POL", "TOP"],
    )
    html = format_as_html(t_pred)
    # Append mode: successive explanations accumulate in the same file.
    # The context manager closes the handle even if the write fails, and the
    # explicit encoding keeps non-ASCII tokens from breaking on platforms
    # whose default encoding is not UTF-8.
    with open("latest_prediction.html", "a+", encoding="utf-8") as html_file:
        html_file.write(html)
    print(te.metrics_)
100122

101-
def direct_explain_pred(sentence):
102-
txt = format_as_text(eli5.explain_prediction(model, doc=sentence, target_names=["green", "neutral", "red"], vec=Text2Vec())) #get vector importances
103-
print(txt)
104123

105124
def print_misclass():
106125
print("misclassified examples!!!")
@@ -124,6 +143,9 @@ def print_misclass():
124143
break
125144
elif label == "stop":
126145
csvfile.close()
146+
if keras:
147+
pipe.named_steps['model'].model.save('keras_model.h5')
148+
pipe.named_steps['model'].model = None
127149
joblib.dump(pipe, 'saved_card_classification.pkl')
128150
print("Model Dumped!!!!")
129151
done = True
@@ -134,8 +156,8 @@ def print_misclass():
134156
t_model = pipe.named_steps['model']
135157
ppset = Sentence(str(to_process))
136158
stacked_embeddings.embed(ppset)
137-
the_emb = ppset.get_embedding().detach().numpy().reshape(1, -1)
159+
the_emb = ppset.get_embedding().cpu().detach().numpy().reshape(1, -1)
138160
t_model.partial_fit(the_emb, the_labels) ##INCREMENTAL LEARNING MODE ENGAGED
139161
the_labels.append(str(to_process))
140162
spamwriter.writerow(the_labels)
141-
csvfile.flush()
163+
csvfile.flush()

explaination2.png

627 KB
Loading

explaination3.png

423 KB
Loading

0 commit comments

Comments
 (0)