22import csv
33from flair .data import Sentence
44from flair .models import SequenceTagger
5- from flair .embeddings import WordEmbeddings , FlairEmbeddings , StackedEmbeddings , DocumentPoolEmbeddings , BertEmbeddings , ELMoEmbeddings , OpenAIGPTEmbeddings
5+ from flair .embeddings import WordEmbeddings , FlairEmbeddings , StackedEmbeddings , DocumentPoolEmbeddings , BertEmbeddings , ELMoEmbeddings , OpenAIGPTEmbeddings , RoBERTaEmbeddings , XLNetEmbeddings
66import torch
77from torch import tensor
88import numpy as np
3030from keras .callbacks import ModelCheckpoint
3131from keras .wrappers .scikit_learn import KerasClassifier
3232from keras .models import Sequential
33- from keras .layers import Dense
33+ from keras .layers import Dense , Conv1D , MaxPooling1D , Flatten , Embedding , Reshape , Input , SimpleRNN , LSTM
34+ import torch .nn as nn
35+ import torch .nn .functional as F
3436
3537
36- keras = False
38+ keras = True
3739
3840def parse_string (a_str ):
3941 to_ret = "" .join ([c .lower () for c in a_str if c in string .ascii_letters or c in string .whitespace ])
@@ -54,7 +56,8 @@ def transform(self, X):
5456 size_of_emb = list_of_embeddings [1 ].size
5557 if not isinstance (X , str ):
5658 for doc in X :
57- p_str = parse_string (doc )
59+ #p_str = parse_string(doc)
60+ p_str = doc
5861 if not p_str :
5962 list_of_emb .append (np .zeros ((size_of_emb ,), dtype = np .float32 ))##TODO: don't hard code vector size
6063 else :
@@ -64,7 +67,8 @@ def transform(self, X):
6467 to_ret = np .array (list_of_emb )
6568 else :
6669 try :
67- p_str = parse_string (X )
70+ #p_str = parse_string(X)
71+ p_str = X
6872 if not p_str :
6973 to_ret = np .zeros ((size_of_emb ,), dtype = np .float32 )##TODO here too
7074 else :
@@ -78,8 +82,9 @@ def transform(self, X):
7882
7983
8084
81- stacked_embeddings = DocumentPoolEmbeddings ([WordEmbeddings ('en' ),
82- WordEmbeddings ('glove' ), WordEmbeddings ('extvec' )])
85+ stacked_embeddings = DocumentPoolEmbeddings ([#WordEmbeddings('en'),
86+ #WordEmbeddings('glove'),
87+ WordEmbeddings ('en-crawl' )])
8388
8489with open ('card_classification.csv' ) as csvfile :
8590 reader = csv .reader (csvfile )
@@ -89,31 +94,37 @@ def transform(self, X):
8994 for row in reader :
9095 list_of_labels .append (row [0 ])
9196 parsed_string = parse_string (row [1 ])
97+ parsed_string = row [1 ]
9298 list_of_sentences .append (parsed_string )
9399 set_obj = Sentence (parsed_string )
94100 stacked_embeddings .embed (set_obj )
95101 list_of_embeddings .append (set_obj .get_embedding ().cpu ().detach ().numpy ())
96102
97103
98- X_train , X_val , Y_train , Y_val , Emb_train , Emb_val = train_test_split (list_of_sentences , list_of_labels , list_of_embeddings , test_size = 0.30 , stratify = list_of_labels , random_state = 42 )
99-
104+ X_train , X_val , Y_train , Y_val , Emb_train , Emb_val = train_test_split (np .asarray (list_of_sentences ), np .asarray (list_of_labels ), np .asarray (list_of_embeddings ), test_size = 0.30 , stratify = list_of_labels , random_state = 42 )
100105
106+ print (list_of_embeddings [1 ].size )
101107
def create_model(optimizer='adam', kernel_initializer='glorot_uniform', epochs=5):
    """Build and compile the dense softmax classifier passed to KerasClassifier.

    The network has two bias-free ``Dense`` layers: a ReLU hidden layer whose
    width equals the size of one document embedding, followed by a softmax
    output layer with one unit per class label.

    Args:
        optimizer: Optimizer name or instance forwarded to ``model.compile``.
        kernel_initializer: Initializer for the output layer's weights. The
            hidden layer always uses ``'he_uniform'``.
        epochs: Not used inside this function; kept so existing callers that
            pass it keep working.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # Width of a single document embedding; read from the module-level list
    # populated while loading the CSV.
    embedding_size = list_of_embeddings[1].size

    # Size the output layer from the *training* labels rather than the
    # validation labels: KerasClassifier one-hot encodes the labels it is fit
    # on, so the softmax width has to match that set.
    # NOTE(review): assumes the pipeline is fit on Y_train -- confirm.
    n_classes = len(np.unique(Y_train))

    model = Sequential()
    model.add(Dense(embedding_size, activation='relu',
                    kernel_initializer='he_uniform', use_bias=False))
    model.add(Dense(n_classes, activation='softmax',
                    kernel_initializer=kernel_initializer, use_bias=False))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer,
                  metrics=['accuracy'])
    return model
108117
109118
110119
111120if keras :
112121 checkpointer = ModelCheckpoint (filepath = '/tmp/weights.hdf5' , verbose = 1 , save_best_only = True )
113- model = KerasClassifier (build_fn = create_model , batch_size = 32 , epochs = 100 , callbacks = [checkpointer ], validation_split = 0.2 )
122+ model = KerasClassifier (build_fn = create_model , batch_size = 32 , epochs = 150 , callbacks = [checkpointer ], validation_split = 0.2 )
123+
124+
114125
115126#model = SVC(kernel = "rbf", probability = True)
116- model = KNeighborsClassifier (n_neighbors = 5 , metric = 'cosine' , weights = 'distance' )
127+ # model = KNeighborsClassifier(n_neighbors=5, metric='cosine', weights = 'distance')
117128#model = AdaBoostClassifier(n_estimators = 100, random_state = 42)
118129#model = RandomForestClassifier(n_jobs = -1, n_estimators = 100, max_features = "auto", criterion = "entropy")
119130#model = MLPClassifier(hidden_layer_sizes=(500,), activation = 'relu', solver = 'adam', verbose = True, max_iter = 100) #early_stopping = True, validation_fraction = 0.3, n_iter_no_change = 100)
@@ -138,7 +149,7 @@ def create_model(optimizer='adam', kernel_initializer='glorot_uniform', epochs =
138149probs = pipe .predict_proba (X_val )
139150a_df = pd .DataFrame (probs , index = Y_val , columns = labels )
140151a_df [a_df .eq (0 )] = np .nan
141- print (a_df )
152+ print (a_df . round ( 2 ) )
142153
143154if keras :
144155 pipe .named_steps ['model' ].model .save ('keras_model.h5' )
0 commit comments