Skip to content

Commit bb99346

Browse files
Add files via upload
1 parent a3b23b6 commit bb99346

3 files changed

Lines changed: 453 additions & 0 deletions

File tree

card_classification.csv

Lines changed: 193 additions & 0 deletions
Large diffs are not rendered by default.

classification.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
import string
2+
import csv
3+
from flair.data import Sentence
4+
from flair.models import SequenceTagger
5+
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, DocumentPoolEmbeddings, BertEmbeddings, ELMoEmbeddings, OpenAIGPTEmbeddings
6+
import torch
7+
from torch import tensor
8+
import numpy as np
9+
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, log_loss, roc_auc_score, make_scorer, balanced_accuracy_score, classification_report, confusion_matrix
10+
from sklearn.naive_bayes import BernoulliNB
11+
from sklearn.metrics.pairwise import cosine_similarity
12+
from sklearn.neighbors import KNeighborsClassifier
13+
from sklearn.svm import SVC
14+
from sklearn.ensemble import RandomForestClassifier
15+
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, cross_val_score
16+
from sklearn.naive_bayes import ComplementNB
17+
from sklearn.neural_network import MLPClassifier
18+
from time import time
19+
import pickle
20+
import umap
21+
from sklearn.pipeline import make_union, Pipeline
22+
from sklearn.base import TransformerMixin, BaseEstimator
23+
#from sklearn.pipeline import Pipeline, make_pipeline
24+
import eli5
25+
from eli5.lime import TextExplainer
26+
from eli5 import explain_prediction
27+
from eli5.formatters import format_as_text
28+
import pandas as pd
29+
from sklearn.externals import joblib
30+
31+
32+
def parse_string(a_str):
    """Normalize raw text: keep only ASCII letters and whitespace,
    lowercase everything, and collapse whitespace runs to single spaces.
    """
    kept = (ch.lower() for ch in a_str
            if ch in string.ascii_letters or ch in string.whitespace)
    return " ".join("".join(kept).split())
37+
38+
class Text2Vec(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer mapping raw text to pooled flair
    document embeddings.

    Accepts either a single string (returned as a ``(1, dim)`` array) or
    an iterable of strings (returned as an ``(n, dim)`` array). Text that
    is empty after ``parse_string`` cleaning maps to a zero vector.
    Relies on the module-level ``stacked_embeddings`` pool and the
    ``parse_string`` helper.
    """

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, X):
        # Ask the embedding pool for its vector size. The original read
        # ``list_of_embeddings[1].size`` — a module global defined *after*
        # this class, indexed at [1], which breaks whenever fewer than two
        # rows have been embedded.
        size_of_emb = stacked_embeddings.embedding_length

        def _embed(text):
            # One document -> one 1-D float32 embedding vector.
            p_str = parse_string(text)
            if not p_str:
                return np.zeros((size_of_emb,), dtype=np.float32)
            sent = Sentence(p_str)
            stacked_embeddings.embed(sent)
            return sent.get_embedding().detach().numpy()

        if isinstance(X, str):
            # Single document. The original wrapped this branch in a bare
            # ``except:`` that printed X and then raised NameError on the
            # unset return value; let real errors propagate instead. Also
            # reshape the empty-text zero vector too, so the single-string
            # path always yields (1, dim).
            return _embed(X).reshape(1, -1)
        return np.array([_embed(doc) for doc in X])
72+
73+
74+
75+
# Document embedding: pooled English fastText + GloVe word vectors.
stacked_embeddings = DocumentPoolEmbeddings([WordEmbeddings('en'),
                                             WordEmbeddings('glove'), ])

# Load the labelled corpus. Each CSV row is (label, text); the raw
# sentence, its label, and its precomputed embedding are kept in
# lockstep lists. newline='' is required by the csv module so quoted
# fields containing newlines round-trip correctly.
list_of_sentences = []
list_of_labels = []
list_of_embeddings = []
with open('card_classification.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if len(row) < 2:
            # Skip blank/malformed rows instead of crashing on row[1].
            continue
        list_of_labels.append(row[0])
        parsed_string = parse_string(row[1])
        list_of_sentences.append(parsed_string)
        set_obj = Sentence(parsed_string)
        stacked_embeddings.embed(set_obj)
        list_of_embeddings.append(set_obj.get_embedding().detach().numpy())

# Stratified 70/30 split; text, labels, and embeddings split together.
X_train, X_val, Y_train, Y_val, Emb_train, Emb_val = train_test_split(
    list_of_sentences, list_of_labels, list_of_embeddings,
    test_size=0.30, stratify=list_of_labels, random_state=42)

# One hidden layer of 500 units; max_iter bounds training epochs.
model = MLPClassifier(hidden_layer_sizes=(500,), activation='relu',
                      solver='adam', verbose=True, max_iter=100)
# End-to-end pipeline: raw text -> flair embedding -> classifier.
pipe = Pipeline([('text2vec', Text2Vec()), ('model', model)])
pipe.fit(X_train, Y_train)

pred = pipe.predict(X_val)
print(accuracy_score(Y_val, pred))

# Confusion matrix with labelled axes for readability.
labels = np.unique(Y_val)
conf = confusion_matrix(Y_val, pred, labels=labels)
print(pd.DataFrame(conf, index=labels, columns=labels))

# Per-class probabilities; blank out exact zeros so the table reads
# cleanly.
probs = pipe.predict_proba(X_val)
a_df = pd.DataFrame(probs, index=Y_val, columns=labels)
a_df[a_df.eq(0)] = np.nan
print(a_df)

# Persist the whole fitted pipeline for docu_learn.py to load.
joblib.dump(pipe, 'saved_card_classification.pkl')
print("Model Dumped!!!!")

docu_learn.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
import glob
2+
import os
3+
from bs4 import BeautifulSoup
4+
import bs4
5+
import string
6+
import flair
7+
from flair.data import Sentence
8+
from flair.models import SequenceTagger
9+
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, DocumentPoolEmbeddings, BertEmbeddings, ELMoEmbeddings
10+
import torch
11+
# create a StackedEmbedding object that combines glove and forward/backward flair embeddings
12+
from sklearn.metrics.pairwise import cosine_similarity
13+
from sklearn.metrics import jaccard_similarity_score
14+
#import numpy as np
15+
from docx import Document
16+
import sys
17+
import numpy as np
18+
from itertools import islice
19+
from collections import deque
20+
import csv
21+
from random import shuffle
22+
from sklearn.externals import joblib
23+
from time import time
24+
import pickle
25+
import umap
26+
from sklearn.pipeline import make_union, Pipeline
27+
from sklearn.base import TransformerMixin, BaseEstimator
28+
#from sklearn.pipeline import Pipeline, make_pipeline
29+
import eli5
30+
from eli5.lime import TextExplainer
31+
from eli5 import explain_prediction
32+
from eli5.formatters import format_as_text
33+
import pandas as pd
34+
35+
36+
# When True, the interactive loop below also incrementally refits the
# classifier on each newly labelled document via partial_fit.
increment = False

# Document embedding: pooled English fastText + GloVe word vectors
# (presumably mean pooling, the DocumentPoolEmbeddings default — TODO
# confirm against the flair version in use). The commented alternatives
# were tried and left disabled.
stacked_embeddings = DocumentPoolEmbeddings([
    WordEmbeddings('en'),
    WordEmbeddings('glove'),
    #WordEmbeddings('extvec'),#ELMoEmbeddings('original'),
    #BertEmbeddings('bert-base-cased'),
    #FlairEmbeddings('news-forward-fast'),
    #FlairEmbeddings('news-backward-fast'),
]) #, mode='max')
46+
47+
def parse_string(a_str):
    """Lowercase *a_str*, drop every character that is not an ASCII
    letter or whitespace, and collapse runs of whitespace to single
    spaces.
    """
    allowed = set(string.ascii_letters) | set(string.whitespace)
    cleaned = "".join(c.lower() for c in a_str if c in allowed)
    return " ".join(cleaned.split())
52+
53+
class Text2Vec(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer mapping raw text to pooled flair
    document embeddings.

    Accepts either a single string (returned as a ``(1, dim)`` array) or
    an iterable of strings (returned as an ``(n, dim)`` array). Text that
    is empty after ``parse_string`` cleaning maps to a zero vector.
    Relies on the module-level ``stacked_embeddings`` pool and the
    ``parse_string`` helper.
    """

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, X):
        # Vector size comes from the embedding pool itself.
        size_of_emb = stacked_embeddings.embedding_length

        def _embed(text):
            # One document -> one 1-D float32 embedding vector.
            p_str = parse_string(text)
            if not p_str:
                return np.zeros((size_of_emb,), dtype=np.float32)
            sent = Sentence(p_str)
            stacked_embeddings.embed(sent)
            return sent.get_embedding().detach().numpy()

        if isinstance(X, str):
            # Single document. The original wrapped this branch in a bare
            # ``except:`` that printed X and then raised NameError on the
            # unset return value; let real errors propagate instead. Also
            # reshape the empty-text zero vector too, so the single-string
            # path always yields (1, dim).
            return _embed(X).reshape(1, -1)
        return np.array([_embed(doc) for doc in X])
87+
88+
89+
90+
# Restore the full pipeline (Text2Vec + MLP) trained and saved by
# classification.py.
pipe = joblib.load('saved_card_classification.pkl')

# LIME-style black-box explainer from eli5: fits a local surrogate on
# 1000 perturbed samples; position_dependent keeps token positions in
# the surrogate's features.
te = TextExplainer(random_state=42, n_samples=1000, position_dependent=True)
93+
94+
def explain_pred(sentence):
    """Fit the LIME text explainer on one document and print the
    per-class token weights plus the explainer's fidelity metrics."""
    class_names = ["ANB", "CAP", "ECON", "EDU", "ENV", "EX",
                   "FED", "HEG", "NAT", "POL", "TOP"]
    te.fit(sentence, pipe.predict_proba)
    explanation = te.explain_prediction(top=20, target_names=class_names)
    print(format_as_text(explanation))
    print(te.metrics_)
100+
101+
def direct_explain_pred(sentence):
    """Print eli5's direct (non-LIME) explanation of the classifier's
    prediction for one document, using Text2Vec as the vectorizer.

    Fix: the original referenced a bare ``model`` name that is never
    defined in this script (it exists only in classification.py), so
    every call raised NameError — pull the fitted estimator out of the
    loaded pipeline instead. Its 3-entry target_names did not match the
    11-class model either; use the same class list as explain_pred.
    """
    clf = pipe.named_steps['model']
    txt = format_as_text(eli5.explain_prediction(
        clf, doc=sentence,
        target_names=["ANB", "CAP", "ECON", "EDU", "ENV", "EX",
                      "FED", "HEG", "NAT", "POL", "TOP"],
        vec=Text2Vec()))  # get vector importances
    print(txt)
104+
105+
def print_misclass(X=None, y=None):
    """Print the indices of examples the pipeline misclassifies.

    NOTE(review): the original read module globals ``X_val``/``Y_val``,
    which are never defined in this script (they exist only in
    classification.py), so the zero-argument call always raised
    NameError. Data can now be passed explicitly; the globals remain
    the fallback for backward compatibility.
    """
    if X is None:
        X = X_val  # fall back to module global, as the original did
    if y is None:
        y = Y_val
    print("misclassified examples!!!")
    print(np.where(y != pipe.predict(X)))
108+
109+
110+
111+
# Interactive labelling loop: classify a pasted document, explain the
# prediction, ask the user for ground-truth label(s), and append the
# labelled example to the training CSV. Optionally (increment=True)
# partial_fit the classifier on the new example.
with open('card_classification.csv', 'a') as csvfile:
    spamwriter = csv.writer(csvfile)
    done = False
    while not done:
        to_process = input("Please copy and paste a document to be classified Ctrl-shift-D or ctrl-D to exit")
        print("MODEL PREDICTION:")
        # Text2Vec.transform special-cases plain strings, so predicting
        # on a single document works here.
        pred = pipe.predict(str(to_process))
        print(pred)
        explain_pred(str(to_process))
        label = input("What is the ground truth label of this? Seperate labels with a space")
        if label == "":
            # No label given: skip storing this example.
            pass
        elif label == "f":
            # 'f' abandons the loop without saving the model.
            break
        elif label == "stop":
            # 'stop' flushes the CSV, saves the pipeline, and exits.
            csvfile.close()
            joblib.dump(pipe, 'saved_card_classification.pkl')
            print("Model Dumped!!!!")
            done = True
            sys.exit()
        else:
            # One or more space-separated labels for this document.
            the_labels = label.split()
            if increment == True:
                # NOTE(review): partial_fit is called with one embedding
                # row but len(the_labels) targets — presumably only a
                # single label is expected here; verify before enabling.
                t_model = pipe.named_steps['model']
                ppset = Sentence(str(to_process))
                stacked_embeddings.embed(ppset)
                the_emb = ppset.get_embedding().detach().numpy().reshape(1, -1)
                t_model.partial_fit(the_emb, the_labels) ##INCREMENTAL LEARNING MODE ENGAGED
            # CSV row layout: label(s) first, document text last.
            the_labels.append(str(to_process))
            spamwriter.writerow(the_labels)
            csvfile.flush()

0 commit comments

Comments
 (0)