import string
import csv
import numpy as np
import pandas as pd
import joblib  # was `from sklearn.externals import joblib`, which newer scikit-learn removed
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
# Only needed for the commented-out model alternatives below:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier


def parse_string(a_str):
    """Lowercase, keep only ASCII letters and whitespace, and collapse whitespace runs."""
    letters_only = "".join(c.lower() for c in a_str if c in string.ascii_letters or c in string.whitespace)
    return " ".join(letters_only.split())
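
# Example (sketch): parse_string("Hello, World! 123") -> "hello world"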

class Text2Vec(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that turns raw text into pooled Flair document embeddings."""

    def fit(self, X, y=None):
        # Stateless: the pre-trained embeddings require no fitting.
        return self

    def transform(self, X):
        # Derive the vector size from the embeddings instead of hard-coding it (old TODO).
        size_of_emb = stacked_embeddings.embedding_length
        if isinstance(X, str):
            # Single document: return a (1, n_features) array.
            p_str = parse_string(X)
            if not p_str:
                return np.zeros((1, size_of_emb), dtype=np.float32)
            sent = Sentence(p_str)
            stacked_embeddings.embed(sent)
            return sent.get_embedding().detach().numpy().reshape(1, -1)
        # Iterable of documents: return an (n_samples, n_features) array.
        list_of_emb = []
        for doc in X:
            p_str = parse_string(doc)
            if not p_str:
                # Nothing left after cleaning: fall back to a zero vector.
                list_of_emb.append(np.zeros((size_of_emb,), dtype=np.float32))
            else:
                sent = Sentence(p_str)
                stacked_embeddings.embed(sent)
                list_of_emb.append(sent.get_embedding().detach().numpy())
        return np.array(list_of_emb)
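
# Usage sketch (outside the pipeline; assumes stacked_embeddings below is loaded):
#   vecs = Text2Vec().fit_transform(["first card text", "second card text"])
#   vecs.shape == (2, stacked_embeddings.embedding_length)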


stacked_embeddings = DocumentPoolEmbeddings([WordEmbeddings('en'),
                                             WordEmbeddings('glove')])
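
# DocumentPoolEmbeddings mean-pools the word embeddings by default; the pooled
# document vector length is exposed as stacked_embeddings.embedding_length.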

with open('card_classification.csv') as csvfile:
    reader = csv.reader(csvfile)
    list_of_sentences = []
    list_of_labels = []
    list_of_embeddings = []
    for row in reader:
        # Expected row layout: column 0 = label, column 1 = card text.
        list_of_labels.append(row[0])
        parsed_string = parse_string(row[1])
        list_of_sentences.append(parsed_string)
        sent_obj = Sentence(parsed_string)
        stacked_embeddings.embed(sent_obj)
        list_of_embeddings.append(sent_obj.get_embedding().detach().numpy())


X_train, X_val, Y_train, Y_val, Emb_train, Emb_val = train_test_split(
    list_of_sentences, list_of_labels, list_of_embeddings,
    test_size=0.30, stratify=list_of_labels, random_state=42)


# Alternatives tried during model selection:
#model = SVC(kernel="rbf", probability=True)
#model = KNeighborsClassifier(n_neighbors=5, metric='cosine', weights='distance')
#model = AdaBoostClassifier(n_estimators=100, random_state=42)
#model = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features="auto", criterion="entropy")
model = MLPClassifier(hidden_layer_sizes=(500,), activation='relu', solver='adam',
                      verbose=True, max_iter=100)  # early_stopping=True, validation_fraction=0.3, n_iter_no_change=100
pipe = Pipeline([('text2vec', Text2Vec()), ('model', model)])
#model.fit(Emb_train, Y_train)  # direct fit on precomputed embeddings, superseded by the pipeline
pipe.fit(X_train, Y_train)

pred = pipe.predict(X_val)

print("Validation accuracy:", accuracy_score(Y_val, pred))

labels = np.unique(Y_val)
conf = confusion_matrix(Y_val, pred, labels=labels)

# Rows are true labels, columns are predicted labels.
print(pd.DataFrame(conf, index=labels, columns=labels))

probs = pipe.predict_proba(X_val)
# Column order follows pipe.classes_ (sorted), which matches np.unique(Y_val) here.
a_df = pd.DataFrame(probs, index=Y_val, columns=labels)
a_df[a_df.eq(0)] = np.nan  # blank out exact zeros so the table is easier to scan
print(a_df)

joblib.dump(pipe, 'saved_card_classification.pkl')
print("Model dumped!")
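
# Reloading the saved pipeline later (sketch):
#   pipe = joblib.load('saved_card_classification.pkl')
#   pipe.predict(["text of a card to classify"])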