-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNLP.py
More file actions
86 lines (67 loc) · 2.77 KB
/
NLP.py
File metadata and controls
86 lines (67 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 3 15:28:15 2018
@author: deepak
"""
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# Tokens that carry no signal for the prediction; preprocess() drops them.
noise_words = ["...", "hmm", ":", ","]

# Initialize the shared NLP helpers.
# A set gives O(1) membership tests; preprocess() checks every token of every
# comment against the stop-word list.
stoplist = set(stopwords.words('english'))
lem = WordNetLemmatizer()
stem = PorterStemmer()  # NOTE(review): not referenced by the visible code.

# Keras' default `filters` string contains "_", which makes the Tokenizer
# split every "<lemma>_<POS>" token built by preprocess() back into two
# unrelated tokens. Use the default filter set minus the underscore so the
# word/POS pairs survive tokenization.
# NOTE(review): tokens made only of punctuation are presumably reduced to a
# bare "_" by this filter set -- confirm that is acceptable.
t = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')

# Load the data sets from the working directory.
train_set = pd.read_csv("train.csv")
test_set = pd.read_csv("test.csv")

# Columns are selected by position.
# NOTE(review): presumably column 3 of train.csv is the comment text and
# column 4 its label, and column 4 of test.csv is the comment text -- verify
# against the actual CSV headers.
train = train_set.iloc[:, 3]
train_label = train_set.iloc[:, 4]
test = test_set.iloc[:, 4]
def preprocess(sentence):
    """Convert a raw sentence into a string of ``lemma_POS`` tokens.

    The sentence is tokenized and POS-tagged; noise words and English stop
    words are dropped; every remaining word is lemmatized as a verb and
    suffixed with ``"_" + <POS tag>``.

    Args:
        sentence: The raw text. ``None``/NaN (what pandas yields for an empty
            CSV cell) is treated as an empty sentence; any other non-string
            value is converted with ``str()``.

    Returns:
        The surviving ``lemma_POS`` tokens joined by single spaces, ready to
        be fed to the Keras ``Tokenizer``. Empty string if nothing survives.
    """
    if not isinstance(sentence, str):
        # Missing cells arrive as NaN floats and would crash word_tokenize.
        if pd.isna(sentence):
            return ""
        sentence = str(sentence)

    # Split the full sentence into an array of words.
    all_words = word_tokenize(sentence)
    # Tag on the original casing. Attaching the part of speech to each word
    # keeps some of the sentence context even if the words are shuffled.
    words_with_pos = pos_tag(all_words)

    noise_free_words = []
    for word, tag in words_with_pos:
        # The stop-word and noise lists are lowercase, so compare
        # case-insensitively; otherwise "The", "This", "Hmm" slip through.
        lowered = word.lower()
        if lowered in noise_words or lowered in stoplist:
            continue
        # Lemmatize (reduce to the root form, treating the word as a verb)
        # and attach the POS tag.
        noise_free_words.append(lem.lemmatize(word, "v") + "_" + tag)

    # A single joined string is needed so the Keras Tokenizer can work on it.
    return " ".join(noise_free_words)
# Convolution geometry. The padded sequence length must leave room for one
# full convolution window followed by one full pooling window.
KERNEL_SIZE = 8
POOL_SIZE = 2

train_processed = [preprocess(text) for text in train]
test_processed = [preprocess(text) for text in test]

# Build the vocabulary from the training texts only: a word that occurs only
# in the test set would get an embedding row that training never updates.
t.fit_on_texts(train_processed)

# An Embedding layer expects integer word indices in reading order, not a
# bag-of-words count matrix (with counts, the layer would look up "word #0",
# "word #1", ... by occurrence count and the convolution would slide over
# vocabulary order instead of word order).
train_sequences = t.texts_to_sequences(train_processed)
test_sequences = t.texts_to_sequences(test_processed)

# Pad everything to the longest training sequence, but never shorter than
# what Conv1D + MaxPooling1D need to produce at least one output step.
max_len = max(
    max((len(seq) for seq in train_sequences), default=0),
    KERNEL_SIZE + POOL_SIZE - 1,
)
encoded_train = pad_sequences(
    train_sequences, maxlen=max_len, padding='post', truncating='post')
encoded_test = pad_sequences(
    test_sequences, maxlen=max_len, padding='post', truncating='post')

# Word indices start at 1; index 0 is the padding value, hence the +1.
vocab_size = len(t.word_index) + 1

# Define the model.
# Convolution model: a feature-extraction model that learns to extract
# features from text.
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_len))
model.add(Conv1D(filters=32, kernel_size=KERNEL_SIZE, activation='relu'))
model.add(MaxPooling1D(pool_size=POOL_SIZE))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Start training.
model.fit(encoded_train, train_label.values, batch_size=32, epochs=10)

# Make predictions on the test set; threshold the sigmoid output at 0.5.
test_pred = model.predict(encoded_test)
y = np.where(test_pred > 0.5, 1, 0)

# Prepare the result and save it to a file. The column order is given
# explicitly, and the DataFrame index is not written so the CSV contains
# exactly the two submission columns.
submission = pd.DataFrame(
    {"COMMENT_ID": test_set["COMMENT_ID"].values, "CLASS": y.ravel()},
    columns=["COMMENT_ID", "CLASS"],
)
submission.to_csv('submission.csv', index=False)