Reddit-Flair-Detector/helper.py at master · Jap-Leen/Reddit-Flair-Detector · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import joblib
import nltk
from nltk.corpus import stopwords
import praw as pr
import pandas as pd
import sklearn
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

stops = set(stopwords.words("english"))
model = joblib.load("./finalized_model.sav")

# Creating an instance of Reddit
reddit = pr.Reddit(client_id='Pr1H4ZD88nm0ag',
                   client_secret='6wjk3Y6PnD-P1FunblDuP0KCibs',
                   password='reddit@123',
                   user_agent='Reddit Flair Detector',
                   username='JapLeen')

def ConvertToString(value):
    return str(value)

def Lemmatization(text):
	token_words = word_tokenize(text)
	ls = WordNetLemmatizer()
	list_lemma = [ls.lemmatize(word) for word in token_words if word.isalnum()]
	text = (" ".join(list_lemma))
	return text

def Stemming(text):
	token_words = word_tokenize(text)
	ps = PorterStemmer()
	list_stem = [ps.stem(word) for word in token_words if word.isalnum()]
	text = (" ".join(list_stem))
	return text

def RemoveStopwords(text):
    text = ' '.join(word for word in text.split() if word not in stops)
    return text

def PreProcessing(df, feature):
	df[feature] = df[feature].apply(ConvertToString)
	df[feature] = df[feature].str.lower()
	# df[feature] = df[feature].apply(Stemming)
	# df[feature] = df[feature].apply(Lemmatization)
	df[feature] = df[feature].apply(RemoveStopwords)

def helper(url):
	submission_info = {"id":[], "title":[], "body":[], "comments":[]}
	submission = reddit.submission(url = url)
	submission_info["id"] = submission.id
	submission_info["title"] = submission.title
	submission_info["body"] = submission.selftext
	comment = ''
	submission.comments.replace_more(limit=0)
	for comment_c in submission.comments:
		comment+= ' ' + comment_c.body
	submission_info["comments"].append(comment)
	df = pd.DataFrame(submission_info)
	df.fillna("")
	selected_features = ['title', 'body', 'comments']
	# Pre-processing the text contained in the selected features
	for feature in selected_features:
		PreProcessing(df, feature)
	# Getting combination of features
	combination_of_features = df["title"] + df["comments"] + df["body"]
	df = df.assign(combination_of_features = combination_of_features)
	return(model.predict(df['combination_of_features'])[0])