|
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB

from rating_prediction_lda import (all_text_test, all_text_train,
                                   topic_dist_test_all_stars,
                                   topic_dist_train_all_stars, totalTopics)
from rating_prediction_tfidf import tfidfvectorizer
| 10 | + |
def getSentiment(s):
    """Binarize a star rating: 0 (negative) when below 3.5, otherwise 1 (positive)."""
    return 0 if s < 3.5 else 1
| 16 | + |
| 17 | + |
# Derive a binary sentiment label from the star rating on both splits.
topic_dist_train_all_stars['Sentiment'] = topic_dist_train_all_stars['Star'].map(getSentiment)
topic_dist_test_all_stars['Sentiment'] = topic_dist_test_all_stars['Star'].map(getSentiment)

# Vectorize the raw review text with TF-IDF: fit on the training split only,
# then reuse the fitted vocabulary to transform the test split.
sentimentTextTrain = tfidfvectorizer.fit_transform(all_text_train)
sentimentTextTest = tfidfvectorizer.transform(all_text_test)

sentimentLabelTrain = topic_dist_train_all_stars['Sentiment']
sentimentLabelTest = topic_dist_test_all_stars['Sentiment']

# Train a logistic-regression sentiment classifier on the TF-IDF features.
classifier = LogisticRegression().fit(sentimentTextTrain, sentimentLabelTrain)

ySentimentTrain = classifier.predict(sentimentTextTrain)
ySentimentTest = classifier.predict(sentimentTextTest)

# Store the predicted sentiment back on the frames so it can be used as an
# extra feature for the star-rating classifiers below.
topic_dist_train_all_stars['Sentiment_Predicted'] = ySentimentTrain
topic_dist_test_all_stars['Sentiment_Predicted'] = ySentimentTest
| 35 | + |
# Feature set: the LDA topic-distribution columns plus the predicted-sentiment
# column. NOTE(review): this assumes the column order is
# topics 0..totalTopics-1, then 'Star', 'Sentiment', 'Sentiment_Predicted'
# (so index totalTopics+2 is 'Sentiment_Predicted') -- confirm against the
# frame built in rating_prediction_lda.
features = list(topic_dist_train_all_stars.columns[:totalTopics])
features.append(topic_dist_train_all_stars.columns[totalTopics+2])


# Target is the raw star rating (multiclass).
x_train = topic_dist_train_all_stars[features]
y_train = topic_dist_train_all_stars['Star']

x_test = topic_dist_test_all_stars[features]
y_test = topic_dist_test_all_stars['Star']
| 45 | + |
# Candidate models for predicting the star rating from topic + sentiment features.
classifiers = [MultinomialNB(),
               LogisticRegression(),
               RandomForestClassifier(n_estimators=100, n_jobs=2),
               AdaBoostClassifier(n_estimators=100)]
classifiers_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Random Forest', 'AdaBoost']

# Fit and evaluate each classifier; collect its metrics keyed by model name.
LdaSentimentResults = {}
for (i, clf_) in enumerate(classifiers):
    clf = clf_.fit(x_train, y_train)
    preds = clf.predict(x_test)

    # Star prediction is multiclass, so precision/recall/f1 need an explicit
    # averaging strategy; 'weighted' matches the historical sklearn default
    # (modern sklearn raises without `average=` on multiclass targets).
    precision = metrics.precision_score(y_test, preds, average='weighted')
    recall = metrics.recall_score(y_test, preds, average='weighted')
    f1 = metrics.f1_score(y_test, preds, average='weighted')
    accuracy = accuracy_score(y_test, preds)
    report = classification_report(y_test, preds)
    # BUG FIX: `starsGroup` was never defined or imported in this module
    # (NameError). Derive the ordered label set from the test targets instead.
    matrix = metrics.confusion_matrix(y_test, preds, labels=sorted(set(y_test)))

    data = {'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'accuracy': accuracy,
            'clf_report': report,
            'clf_matrix': matrix,
            'y_predicted': preds}

    LdaSentimentResults[classifiers_names[i]] = data
| 70 | + |
| 71 | + |
| 72 | + |
# Summarize the headline metrics for all models in one table
# (rows: metric, columns: model).
# BUG FIX: the original DataFrame expression's result was discarded (notebook
# leftover) and `pd` was never imported; bind the table and show it.
cols = ['precision', 'recall', 'f1_score', 'accuracy']
summary = pd.DataFrame(LdaSentimentResults).T[cols].T
print(summary)
| 75 | + |
# Print a human-readable evaluation summary for every model.
# BUG FIX: `iteritems` and the print statements were Python-2-only; this form
# produces identical output under both Python 2 and Python 3.
for model, val in LdaSentimentResults.items():
    header = '-------' + '-' * len(model)
    print(header)
    print('MODEL: ' + model)
    print(header)
    print('The precision for this classifier is ' + str(val['precision']))
    print('The recall for this classifier is ' + str(val['recall']))
    print('The f1 for this classifier is ' + str(val['f1_score']))
    print('The accuracy for this classifier is ' + str(val['accuracy']))
    print('The confusion matrix for this classifier is \n' + str(val['clf_matrix']))
    print('\nHere is the classification report:')
    print(val['clf_report'])
0 commit comments