Skip to content

Commit 04eb017

Browse files
Add files via upload
1 parent 61dcc2d commit 04eb017

1 file changed

Lines changed: 32 additions & 18 deletions

File tree

classification.py

Lines changed: 32 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import csv
33
from flair.data import Sentence
44
from flair.models import SequenceTagger
5-
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, DocumentPoolEmbeddings, BertEmbeddings, ELMoEmbeddings, OpenAIGPTEmbeddings, RoBERTaEmbeddings, XLNetEmbeddings, BytePairEmbeddings
5+
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, DocumentPoolEmbeddings, BertEmbeddings, ELMoEmbeddings, OpenAIGPTEmbeddings, RoBERTaEmbeddings, XLNetEmbeddings, BytePairEmbeddings, XLNetEmbeddings, OpenAIGPT2Embeddings
66
import torch
77
from torch import tensor
88
import numpy as np
@@ -30,12 +30,13 @@
3030
from keras.callbacks import ModelCheckpoint
3131
from keras.wrappers.scikit_learn import KerasClassifier
3232
from keras.models import Sequential
33-
from keras.layers import Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Embedding, Reshape, Input, SimpleRNN, LSTM, InputLayer, GRU
33+
from keras.layers import Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Embedding, Reshape, Input, SimpleRNN, LSTM, InputLayer, GRU, GlobalMaxPooling1D
3434
import torch.nn as nn
3535
import torch.nn.functional as F
3636

3737

3838
# Switch tested by the `if keras:` sections of this script (Keras classifier
# construction and saving of the trained Keras model).
keras = True
# Architecture built by create_model(): "CNN" or "RNN" select those branches;
# any other value (such as "MLP") falls through to the dense-only model.
keras_mode = "MLP"
3940

4041
def parse_string(a_str):
4142
to_ret = "".join([c.lower() for c in a_str if c in string.ascii_letters or c in string.whitespace])
@@ -45,7 +46,6 @@ def parse_string(a_str):
4546

4647

4748
def get_misclass():
    """Return the positions of the misclassified validation examples.

    Compares the module-level ``Y_val`` labels with ``pipe.predict(X_val)``
    and returns the result of ``np.where`` on the mismatch mask, i.e. a tuple
    of index arrays (one array per dimension of the comparison result).
    """
    predictions = pipe.predict(X_val)
    return np.where(Y_val != predictions)
5050

5151
class Text2Vec( BaseEstimator, TransformerMixin):
@@ -88,8 +88,12 @@ def transform(self, X):
8888

8989

9090
# Document-level embedding built from the word embeddings listed below.
# NOTE(review): despite the name, this is a DocumentPoolEmbeddings instance,
# not a StackedEmbeddings one.
# The commented-out entries are alternative embeddings that can be enabled
# by uncommenting them.
stacked_embeddings = DocumentPoolEmbeddings([
    WordEmbeddings('en'),
    # XLNetEmbeddings('base-cased'),
    # OpenAIGPT2Embeddings(),
    # FlairEmbeddings('news-forward-fast'),
    # FlairEmbeddings('news-backward-fast'),
    WordEmbeddings('glove'),
    WordEmbeddings('en-crawl'),
    # BytePairEmbeddings('en', 300),
])
9599

@@ -114,25 +118,31 @@ def transform(self, X):
114118

115119
def create_model(optimizer='adam', kernel_initializer='glorot_uniform', epochs=5):
    """Build and compile the Keras classifier selected by ``keras_mode``.

    The architecture depends on the module-level ``keras_mode``:

    * ``"CNN"`` -- Reshape -> Conv1D(50 filters, kernel size 1) -> Flatten
    * ``"RNN"`` -- Reshape -> GRU
    * anything else -- a single hidden Dense layer (simple MLP)

    Every variant ends with a bias-free softmax Dense layer.

    Besides ``keras_mode`` the function reads these module-level names:
    ``list_of_embeddings`` (layer width), ``Emb_train`` (input shape) and
    ``Y_val`` (number of output classes).

    Args:
        optimizer: Optimizer passed to ``model.compile``.
        kernel_initializer: Initializer for the softmax output layer.
        epochs: Not used inside this function.
            NOTE(review): presumably kept so callers can pass it through the
            scikit-learn wrapper -- confirm before removing.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        accuracy metric).
    """
    embedding_dim = list_of_embeddings[1].size
    # NOTE(review): the class count comes from the validation labels only; a
    # label missing from Y_val would shrink the output layer -- confirm that
    # Y_val always contains every class.
    n_classes = len(np.unique(Y_val))

    model = Sequential()

    if keras_mode in ("CNN", "RNN"):
        # Conv1D and GRU take sequence input, so each embedding vector is
        # reshaped into a sequence of length one.
        model.add(Reshape((1, embedding_dim), input_shape=Emb_train.shape[1:]))
        if keras_mode == "CNN":
            model.add(Conv1D(filters=50, kernel_size=1, activation='relu'))
            # Flatten is required after Conv1D before the Dense output layer;
            # a GlobalMaxPooling1D layer could be used here instead.
            model.add(Flatten())
        else:
            model.add(GRU(embedding_dim))
    else:
        # Simple MLP: one hidden layer as wide as the embedding.
        model.add(Dense(embedding_dim, activation='relu',
                        kernel_initializer='he_uniform', use_bias=False))

    model.add(Dense(n_classes, activation='softmax',
                    kernel_initializer=kernel_initializer, use_bias=False))

    model.compile(loss='categorical_crossentropy', optimizer=optimizer,
                  metrics=['accuracy'])
    return model
130140

131141

132142

133143
if keras:
    # Checkpoint callback writing to /tmp/weights.hdf5; save_best_only=True
    # keeps only the best checkpoint seen so far.
    checkpointer = ModelCheckpoint(filepath='/tmp/weights.hdf5', verbose=1,
                                   save_best_only=True)
    # scikit-learn compatible wrapper around create_model(), configured with
    # the training settings (batch size, epoch count, callbacks, validation
    # split) given here.
    model = KerasClassifier(build_fn=create_model, batch_size=32, epochs=400,
                            callbacks=[checkpointer], validation_split=0.2)
136146

137147

138148

@@ -166,7 +176,11 @@ def create_model(optimizer='adam', kernel_initializer='glorot_uniform', epochs =
166176
a_df[a_df.eq(0)] = np.nan
167177
print(a_df.round(2))
168178

169-
# Compute the misclassified positions once and reuse them: every call to
# get_misclass() re-runs pipe.predict on the validation set.
misclass = get_misclass()

print("misclassified examples!!!")
print(misclass)
print(a_df.iloc[misclass].round(2))
170184

171185
if keras:
    # Save the Keras model held by the pipeline's 'model' step to
    # keras_model.h5 in the current working directory.
    pipe.named_steps['model'].model.save('keras_model.h5')

0 commit comments

Comments
 (0)