import csv

import numpy as np
import torch
from torch import tensor
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, DocumentPoolEmbeddings, BertEmbeddings, ELMoEmbeddings, OpenAIGPTEmbeddings, RoBERTaEmbeddings, XLNetEmbeddings, BytePairEmbeddings
@@ -43,6 +43,11 @@ def parse_string(a_str):
4343 to_ret3 = " " .join (to_ret2 )
4444 return to_ret3
4545
def get_misclass():
    """Return the indices of validation samples the pipeline misclassifies.

    Returns the ``np.where`` result (a tuple of index arrays) for positions
    where the predicted label differs from the true label.

    NOTE(review): relies on module-level globals — ``pipe`` (a fitted
    sklearn pipeline), ``X_val`` (validation features) and ``Y_val``
    (validation labels) — so it must be called after the pipeline is fit.
    """
    print("misclassified examples!!!")
    return np.where(Y_val != pipe.predict(X_val))
4651class Text2Vec ( BaseEstimator , TransformerMixin ):
4752 '''
4853 def __init__():
@@ -82,9 +87,11 @@ def transform(self, X):
8287
8388
8489
# Document embedding: mean-pool fastText English word embeddings over each
# sentence. Alternative embeddings tried earlier are kept commented out.
stacked_embeddings = DocumentPoolEmbeddings([WordEmbeddings('en'),
                                             #WordEmbeddings('glove'),
                                             #WordEmbeddings('en-crawl'),
                                             #BytePairEmbeddings('en', 300),
                                             ])
8895
8996with open ('card_classification.csv' ) as csvfile :
9097 reader = csv .reader (csvfile )
@@ -109,10 +116,10 @@ def create_model(optimizer='adam', kernel_initializer='glorot_uniform', epochs =
109116 model = Sequential ()
110117 #model.add(InputLayer(input_shape=(153, 1, 300)))
111118 #print(model.output_shape)
112- model .add (Reshape ((1 , list_of_embeddings [1 ].size ), input_shape = Emb_train .shape [1 :])) ##magical fucking stupid keras BS
119+ model .add (Reshape ((1 , list_of_embeddings [1 ].size ), input_shape = Emb_train .shape [1 :])) ##magical fucking stupid keras BS needed for RNN/CNN
113120 #print(model.output_shape)
114- #model.add(Conv1D(filters=12 , kernel_size=1, activation='relu')) ##works now
115- model .add (GRU (list_of_embeddings [1 ].size ))
121+ #model.add(Conv1D(filters=20 , kernel_size=1, activation='relu')) ##works now
122+ model .add (GRU (list_of_embeddings [1 ].size )) ##this works too - seems to be better for smaller datasets too!
116123 #print(model.output_shape)
117124 #model.add(Flatten())
118125 #model.add(Dense(list_of_embeddings[1].size, activation='relu',kernel_initializer='he_uniform', use_bias = False))
@@ -125,7 +132,7 @@ def create_model(optimizer='adam', kernel_initializer='glorot_uniform', epochs =
125132
if keras:
    # Checkpoint the weights with the best validation loss seen so far.
    checkpointer = ModelCheckpoint(filepath='/tmp/weights.hdf5', verbose=1, save_best_only=True)
    # sklearn-compatible wrapper so the Keras net can sit inside a Pipeline;
    # 20% of the training data is held out for validation each epoch.
    model = KerasClassifier(build_fn=create_model, batch_size=32, epochs=50,
                            callbacks=[checkpointer], validation_split=0.2)
129136
130137
131138
@@ -152,11 +159,15 @@ def create_model(optimizer='adam', kernel_initializer='glorot_uniform', epochs =
152159
# Confusion matrix labelled by class on both axes.
print(pd.DataFrame(conf, index=labels, columns=labels))


predicts = pipe.predict(X_val)
probs = pipe.predict_proba(X_val)
# Per-sample class probabilities, rows indexed by the true label.
a_df = pd.DataFrame(probs, index=Y_val, columns=labels)
# Blank out exact zeros so the rounded printout stays readable.
a_df[a_df.eq(0)] = np.nan
print(a_df.round(2))

# Show the probability rows for the misclassified validation samples.
print(a_df.iloc[get_misclass()])
160171if keras :
161172 pipe .named_steps ['model' ].model .save ('keras_model.h5' )
162173 pipe .named_steps ['model' ].model = None