
Commit dcacf02

Add files via upload
1 parent d41ab41 commit dcacf02

6 files changed

Lines changed: 99794 additions & 22 deletions


card_classification.csv

Lines changed: 26 additions & 0 deletions
Large diffs are not rendered by default.

classification.py

Lines changed: 24 additions & 13 deletions
@@ -2,7 +2,7 @@
 import csv
 from flair.data import Sentence
 from flair.models import SequenceTagger
-from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, DocumentPoolEmbeddings, BertEmbeddings, ELMoEmbeddings, OpenAIGPTEmbeddings
+from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, DocumentPoolEmbeddings, BertEmbeddings, ELMoEmbeddings, OpenAIGPTEmbeddings, RoBERTaEmbeddings, XLNetEmbeddings
 import torch
 from torch import tensor
 import numpy as np
@@ -30,10 +30,12 @@
 from keras.callbacks import ModelCheckpoint
 from keras.wrappers.scikit_learn import KerasClassifier
 from keras.models import Sequential
-from keras.layers import Dense
+from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Embedding, Reshape, Input, SimpleRNN, LSTM
+import torch.nn as nn
+import torch.nn.functional as F
 
 
-keras = False
+keras = True
 
 def parse_string(a_str):
     to_ret = "".join([c.lower() for c in a_str if c in string.ascii_letters or c in string.whitespace])
@@ -54,7 +56,8 @@ def transform(self, X):
         size_of_emb = list_of_embeddings[1].size
         if not isinstance(X, str):
             for doc in X:
-                p_str = parse_string(doc)
+                #p_str = parse_string(doc)
+                p_str = doc
                 if not p_str:
                     list_of_emb.append(np.zeros((size_of_emb,), dtype=np.float32))##TODO: don't hard code vector size
                 else:
@@ -64,7 +67,8 @@ def transform(self, X):
             to_ret = np.array(list_of_emb)
         else:
             try:
-                p_str = parse_string(X)
+                #p_str = parse_string(X)
+                p_str = X
                 if not p_str:
                     to_ret = np.zeros((size_of_emb,), dtype=np.float32)##TODO here too
                 else:
@@ -78,8 +82,9 @@ def transform(self, X):
 
 
 
-stacked_embeddings = DocumentPoolEmbeddings([WordEmbeddings('en'),
-                                             WordEmbeddings('glove'), WordEmbeddings('extvec')])
+stacked_embeddings = DocumentPoolEmbeddings([#WordEmbeddings('en'),
+                                             #WordEmbeddings('glove'),
+                                             WordEmbeddings('en-crawl')])
 
 with open('card_classification.csv') as csvfile:
     reader = csv.reader(csvfile)
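
For reference, a minimal sketch (not part of the commit) of what the switched-on 'en-crawl' pooling produces, using the same flair calls as the reader loop below: one mean-pooled fastText vector per document.

# Sketch only: illustrates the pooled 'en-crawl' embedding path enabled above.
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

emb = DocumentPoolEmbeddings([WordEmbeddings('en-crawl')])
s = Sentence("federal education policy boosts economic growth")
emb.embed(s)
vec = s.get_embedding().cpu().detach().numpy()
print(vec.shape)  # one fixed-length vector per document (300-d fastText crawl vectors)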
@@ -89,31 +94,37 @@ def transform(self, X):
     for row in reader:
         list_of_labels.append(row[0])
         parsed_string = parse_string(row[1])
+        parsed_string = row[1]
         list_of_sentences.append(parsed_string)
         set_obj = Sentence(parsed_string)
         stacked_embeddings.embed(set_obj)
         list_of_embeddings.append(set_obj.get_embedding().cpu().detach().numpy())
 
 
-X_train, X_val, Y_train, Y_val, Emb_train, Emb_val = train_test_split(list_of_sentences, list_of_labels, list_of_embeddings, test_size = 0.30, stratify = list_of_labels, random_state=42)
-
+X_train, X_val, Y_train, Y_val, Emb_train, Emb_val = train_test_split(np.asarray(list_of_sentences), np.asarray(list_of_labels), np.asarray(list_of_embeddings), test_size = 0.30, stratify = list_of_labels, random_state=42)
 
+print(list_of_embeddings[1].size)
 
 def create_model(optimizer='adam', kernel_initializer='glorot_uniform', epochs = 5):
     model = Sequential()
+    #model.add(Reshape((137, 1, 400), input_shape = (137, 400)))
+    #model.add(Conv1D(64, 1, activation='relu'))
     model.add(Dense(list_of_embeddings[1].size, activation='relu',kernel_initializer='he_uniform', use_bias = False))
-    model.add(Dense(11,activation='softmax',kernel_initializer=kernel_initializer, use_bias = False))
+    #model.add(LSTM(list_of_embeddings[1].size, return_sequences = True,))
+    model.add(Dense(len(np.unique(Y_val)),activation='softmax',kernel_initializer=kernel_initializer, use_bias = False))
     model.compile(loss='categorical_crossentropy',optimizer=optimizer, metrics=['accuracy'])
     return model
 
 
 
 if keras:
     checkpointer = ModelCheckpoint(filepath='/tmp/weights.hdf5', verbose=1, save_best_only=True)
-    model = KerasClassifier(build_fn=create_model, batch_size = 32, epochs = 100, callbacks=[checkpointer], validation_split = 0.2)
+    model = KerasClassifier(build_fn=create_model, batch_size = 32, epochs = 150, callbacks=[checkpointer], validation_split = 0.2)
+
+
 
 #model = SVC(kernel = "rbf", probability = True)
-model = KNeighborsClassifier(n_neighbors=5, metric='cosine', weights = 'distance')
+#model = KNeighborsClassifier(n_neighbors=5, metric='cosine', weights = 'distance')
 #model = AdaBoostClassifier(n_estimators = 100, random_state = 42)
 #model = RandomForestClassifier(n_jobs = -1, n_estimators = 100, max_features = "auto", criterion = "entropy")
 #model = MLPClassifier(hidden_layer_sizes=(500,), activation = 'relu', solver = 'adam', verbose = True, max_iter = 100) #early_stopping = True, validation_fraction = 0.3, n_iter_no_change = 100)
@@ -138,7 +149,7 @@ def create_model(optimizer='adam', kernel_initializer='glorot_uniform', epochs =
 probs = pipe.predict_proba(X_val)
 a_df = pd.DataFrame(probs, index=Y_val, columns=labels)
 a_df[a_df.eq(0)] = np.nan
-print(a_df)
+print(a_df.round(2))
 
 if keras:
     pipe.named_steps['model'].model.save('keras_model.h5')
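
The diff references pipe.predict_proba and pipe.named_steps['model'] but does not show how the pipeline is assembled. Below is a minimal sketch under that assumption; PooledFlairEmbedder is a hypothetical stand-in for the custom transform() class edited in the hunks above.

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import numpy as np

class PooledFlairEmbedder(BaseEstimator, TransformerMixin):
    # Hypothetical stand-in for the transformer edited above:
    # maps each document to its pooled en-crawl embedding.
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        vecs = []
        for doc in X:
            s = Sentence(doc)
            stacked_embeddings.embed(s)
            vecs.append(s.get_embedding().cpu().detach().numpy())
        return np.array(vecs)

# 'model' matches the step name implied by pipe.named_steps['model'];
# model is the KerasClassifier (or a commented-out sklearn alternative) chosen above.
pipe = Pipeline([('embed', PooledFlairEmbedder()), ('model', model)])
pipe.fit(X_train, Y_train)
print(pipe.score(X_val, Y_val))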

docu_learn.py

Lines changed: 9 additions & 9 deletions
@@ -38,13 +38,13 @@
 from keras.layers import Dense
 from keras.models import load_model
 
-keras = False
+keras = True
 increment = False
 
 stacked_embeddings = DocumentPoolEmbeddings([
-    WordEmbeddings('en'),
-    WordEmbeddings('glove'),
-    WordEmbeddings('extvec'),#ELMoEmbeddings('original'),
+    #WordEmbeddings('en'),
+    #WordEmbeddings('glove'),
+    WordEmbeddings('en-crawl'),#ELMoEmbeddings('original'),
     #BertEmbeddings('bert-base-cased'),
     #FlairEmbeddings('news-forward-fast'),
     #FlairEmbeddings('news-backward-fast'),
@@ -107,12 +107,12 @@ def transform(self, X):
     pipe.named_steps['model'].model = load_model('keras_model.h5')
 
 
-te = TextExplainer(random_state=42, n_samples=10000, position_dependent=True)
+te = TextExplainer(random_state=42, n_samples=3000, position_dependent=False)
 
 def explain_pred(sentence):
     te.fit(sentence, pipe.predict_proba)
     #txt = format_as_text(te.explain_prediction(target_names=["green", "neutral", "red"]))
-    t_pred = te.explain_prediction(top = 20, target_names=["ANB", "CAP", "ECON", "EDU", "ENV", "EX", "FED", "HEG", "NAT", "POL", "TOP"])
+    t_pred = te.explain_prediction(top = 20, target_names=["ANB", "CAP", "ECON", "EDU", "ENV", "EX", "FED", "HEG", "NAT", "POL", "TOP", "ORI", "QER","COL",])
     txt = format_as_text(t_pred)
     html = format_as_html(t_pred)
     html_file = open("latest_prediction.html", "a+")
@@ -143,11 +143,11 @@ def print_misclass():
             break
         elif label == "stop":
             csvfile.close()
-            if keras:
+            if keras and increment:
                 pipe.named_steps['model'].model.save('keras_model.h5')
                 pipe.named_steps['model'].model = None
-                joblib.dump(pipe, 'saved_card_classification.pkl')
-                print("Model Dumped!!!!")
+            joblib.dump(pipe, 'saved_card_classification.pkl')
+            print("Model Dumped!!!!")
             done = True
             sys.exit()
         else:
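
As a usage sketch (assumed, not part of the commit): explain_pred() above can be called on any card text; the retuned TextExplainer fits a local surrogate on 3,000 perturbed samples of that text and appends the per-class word weights to latest_prediction.html.

sample = "the plan increases federal investment in renewable energy"
explain_pred(sample)   # fits te on pipe.predict_proba and writes the HTML/text explanation
print(te.metrics_)     # eli5's fidelity metrics for the surrogate (score, mean KL divergence)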

keras_model.h5

1.1 MB
Binary file not shown.
