Skip to content

Commit 685f909

Browse files
committed
Added python files for rating prediction using TFIDF and LDA+sentiment
1 parent a7312e9 commit 685f909

File tree

3 files changed

+150
-1
lines changed

3 files changed

+150
-1
lines changed

rating_prediction_lda.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -194,7 +194,10 @@ def getTopicDistMatrix(lda, totalTopics, corpus, all_dist, star):
194194
print "Number of 2-star reviews after processing: ", len(corpus_2stars_test)
195195
print "Number of 1-star reviews after processing: ", len(corpus_1stars_test)
196196

197-
197+
# Build the combined test corpus by chaining the per-star test corpora in
# descending star order: 5, 4, 3, 2, then 1. Each step appends the next
# star level to the running result; the final array is all_text_test.
# NOTE(review): any label vector paired with all_text_test presumably has to
# follow this same 5 -> 1 ordering -- confirm against the code that builds it.
all_5_4_test = np.append(corpus_5stars_test, corpus_4stars_test)
198+
all_5_4_3_test = np.append(all_5_4_test, corpus_3stars_test)
199+
all_5_4_3_2_test = np.append(all_5_4_3_test, corpus_2stars_test)
200+
all_text_test = np.append(all_5_4_3_2_test, corpus_1stars_test)
198201

199202
topic_dist_list = []
200203

rating_prediction_lda_sentiment.py

Lines changed: 86 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,86 @@
1+
from sklearn import metrics
2+
from sklearn.metrics import classification_report
3+
from sklearn.naive_bayes import MultinomialNB
4+
from sklearn.linear_model import LogisticRegression
5+
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
6+
from sklearn.metrics import accuracy_score
7+
8+
from rating_prediction_lda import totalTopics,all_text_train, all_text_test,topic_dist_train_all_stars,topic_dist_test_all_stars
9+
from rating_prediction_tfidf import tfidfvectorizer
10+
11+
def getSentiment(s, threshold=3.5):
    """Map a star rating to a binary sentiment label.

    Args:
        s: The star rating to classify (any value comparable with a number).
        threshold: Cut-off separating the two classes. Defaults to 3.5,
            the value that was previously hard-coded.

    Returns:
        0 when ``s`` is strictly below ``threshold``, otherwise 1.
    """
    # Keep the "strictly below" comparison: a rating exactly equal to the
    # threshold is labelled 1.
    if s < threshold:
        return 0
    return 1
16+
17+
18+
# pandas (summary table) and starsGroup (confusion-matrix labels) are used
# below but are not imported at the top of this script, so bring them into
# scope here.
import pandas as pd
from rating_prediction_lda import starsGroup

# ---------------------------------------------------------------------------
# Stage 1: predict a binary sentiment from the review text.
# ---------------------------------------------------------------------------

# Ground-truth sentiment derived from the star rating via getSentiment.
topic_dist_train_all_stars['Sentiment'] = topic_dist_train_all_stars['Star'].map(getSentiment)
topic_dist_test_all_stars['Sentiment'] = topic_dist_test_all_stars['Star'].map(getSentiment)

# TF-IDF features of the review text: fitted on the training text only and
# then applied unchanged to the test text.
sentimentTextTrain = tfidfvectorizer.fit_transform(all_text_train)
sentimentTextTest = tfidfvectorizer.transform(all_text_test)

sentimentLabelTrain = topic_dist_train_all_stars['Sentiment']
sentimentLabelTest = topic_dist_test_all_stars['Sentiment']

classifier = LogisticRegression().fit(sentimentTextTrain, sentimentLabelTrain)

ySentimentTrain = classifier.predict(sentimentTextTrain)
ySentimentTest = classifier.predict(sentimentTextTest)

# Store the predicted sentiment next to the topic distributions so it can be
# used as an additional feature in stage 2.
topic_dist_train_all_stars['Sentiment_Predicted'] = ySentimentTrain
topic_dist_test_all_stars['Sentiment_Predicted'] = ySentimentTest

# ---------------------------------------------------------------------------
# Stage 2: predict the star rating from topic distribution + sentiment.
# ---------------------------------------------------------------------------

# The first `totalTopics` columns are used as the topic features.
features = list(topic_dist_train_all_stars.columns[:totalTopics])
# NOTE(review): positional lookup -- presumably resolves to
# 'Sentiment_Predicted' (topics, 'Star', 'Sentiment', 'Sentiment_Predicted');
# confirm the column layout produced by rating_prediction_lda.
features.append(topic_dist_train_all_stars.columns[totalTopics+2])

x_train = topic_dist_train_all_stars[features]
y_train = topic_dist_train_all_stars['Star']

x_test = topic_dist_test_all_stars[features]
y_test = topic_dist_test_all_stars['Star']

classifiers = [MultinomialNB(), LogisticRegression(), RandomForestClassifier(n_estimators=100, n_jobs=2), AdaBoostClassifier(n_estimators=100)]
classifiers_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Random Forest', 'AdaBoost']

# Fixed, sorted label order so every confusion matrix has the same layout.
# presumably starsGroup.groups is keyed by star value -- verify in
# rating_prediction_lda.
star_labels = sorted(starsGroup.groups.keys())

LdaSentimentResults = {}
for name, clf_ in zip(classifiers_names, classifiers):
    clf = clf_.fit(x_train, y_train)
    preds = clf.predict(x_test)

    # 'Star' is a multi-class target, so an averaging mode must be given
    # explicitly: the default ('binary') in current scikit-learn raises for
    # multi-class labels.
    precision = metrics.precision_score(y_test, preds, average='weighted')
    recall = metrics.recall_score(y_test, preds, average='weighted')
    f1 = metrics.f1_score(y_test, preds, average='weighted')
    accuracy = accuracy_score(y_test, preds)
    report = classification_report(y_test, preds)
    matrix = metrics.confusion_matrix(y_test, preds, labels=star_labels)

    LdaSentimentResults[name] = {'precision':precision,
                                 'recall':recall,
                                 'f1_score':f1,
                                 'accuracy':accuracy,
                                 'clf_report':report,
                                 'clf_matrix':matrix,
                                 'y_predicted':preds}

# Side-by-side table of the scalar metrics (one column per model). The
# original expression built this table and discarded it; print it so the
# comparison is visible when run as a script.
cols = ['precision', 'recall', 'f1_score', 'accuracy']
LdaSentimentSummary = pd.DataFrame(LdaSentimentResults).T[cols].T
print(LdaSentimentSummary)

# Detailed per-model report, in the order the classifiers were declared.
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
for model in classifiers_names:
    val = LdaSentimentResults[model]
    rule = '-------'+'-'*len(model)
    print(rule)
    print('MODEL: ' + model)
    print(rule)
    print('The precision for this classifier is ' + str(val['precision']))
    print('The recall for this classifier is ' + str(val['recall']))
    print('The f1 for this classifier is ' + str(val['f1_score']))
    print('The accuracy for this classifier is ' + str(val['accuracy']))
    print('The confusion matrix for this classifier is \n' + str(val['clf_matrix']))
    print('\nHere is the classification report:')
    print(val['clf_report'])

rating_prediction_tfidf.py

Lines changed: 60 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,60 @@
1+
from sklearn import metrics
2+
from sklearn.metrics import classification_report
3+
from sklearn.naive_bayes import MultinomialNB
4+
from sklearn.feature_extraction.text import TfidfVectorizer
5+
from sklearn.linear_model import LogisticRegression
6+
from sklearn.metrics import accuracy_score
7+
8+
from rating_prediction_lda import starsGroup,all_text_train, all_text_test,topic_dist_train_all_stars,topic_dist_test_all_stars
9+
10+
# pandas is used for the summary table below but is not imported at the top
# of this script, so bring it into scope here.
import pandas as pd

# Extracting features using TF-IDF weighting: the vectorizer is fitted on the
# training text only and then applied unchanged to the test text.
tfidfvectorizer = TfidfVectorizer()

tfidfTrain = tfidfvectorizer.fit_transform(all_text_train)
tfidfTest = tfidfvectorizer.transform(all_text_test)

# Star labels for each split. The test labels must come from the *test*
# frame; they were previously read from the training frame, so every metric
# compared test predictions against training labels.
# NOTE(review): assumes the rows of topic_dist_*_all_stars are in the same
# order as all_text_train / all_text_test -- confirm in rating_prediction_lda.
tfidfLabelTrain = topic_dist_train_all_stars['Star']
tfidfLabelTest = topic_dist_test_all_stars['Star']

classifiers = [MultinomialNB(), LogisticRegression()]
classifiers_names = ['Multinomial Naive Bayes', 'Logistic Regression']

# Fixed, sorted label order so every confusion matrix has the same layout.
# presumably starsGroup.groups is keyed by star value -- verify in
# rating_prediction_lda.
star_labels = sorted(starsGroup.groups.keys())

TFIDF_Pred_Results = {}
for name, clf_ in zip(classifiers_names, classifiers):
    clf = clf_.fit(tfidfTrain, tfidfLabelTrain)
    prediction = clf.predict(tfidfTest)

    # 'Star' is a multi-class target, so an averaging mode must be given
    # explicitly: the default ('binary') in current scikit-learn raises for
    # multi-class labels. All metrics are scored against the test labels.
    precision = metrics.precision_score(tfidfLabelTest, prediction, average='weighted')
    recall = metrics.recall_score(tfidfLabelTest, prediction, average='weighted')
    f1 = metrics.f1_score(tfidfLabelTest, prediction, average='weighted')
    accuracy = accuracy_score(tfidfLabelTest, prediction)
    report = classification_report(tfidfLabelTest, prediction)
    matrix = metrics.confusion_matrix(tfidfLabelTest, prediction, labels=star_labels)

    TFIDF_Pred_Results[name] = {'precision':precision,
                                'recall':recall,
                                'f1_score':f1,
                                'accuracy':accuracy,
                                'clf_report':report,
                                'clf_matrix':matrix,
                                # was the undefined name `preds` (NameError)
                                'y_predicted':prediction}

# Side-by-side table of the scalar metrics (one column per model). The
# original expression built this table and discarded it; print it so the
# comparison is visible when run as a script.
cols = ['precision', 'recall', 'f1_score', 'accuracy']
TFIDF_Pred_Summary = pd.DataFrame(TFIDF_Pred_Results).T[cols].T
print(TFIDF_Pred_Summary)

# Detailed per-model report, in the order the classifiers were declared.
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
for model in classifiers_names:
    val = TFIDF_Pred_Results[model]
    rule = '-------'+'-'*len(model)
    print(rule)
    print('MODEL: ' + model)
    print(rule)
    print('The precision for this classifier is ' + str(val['precision']))
    print('The recall for this classifier is ' + str(val['recall']))
    print('The f1 for this classifier is ' + str(val['f1_score']))
    print('The accuracy for this classifier is ' + str(val['accuracy']))
    print('The confusion matrix for this classifier is \n' + str(val['clf_matrix']))
    print('\nHere is the classification report:')
    print(val['clf_report'])
60+

0 commit comments

Comments
 (0)