# _init_search.py
import os
import csv
import time
import logging
import datetime
import re
from datetime import tzinfo
import pytz
from pytz import timezone
from dateutil import parser
from twitter import *
logger = logging.getLogger("root")
logging.basicConfig(
    format="\033[1;36m%(levelname)s: %(filename)s (def %(funcName)s %(lineno)s): \033[1;37m %(message)s",
    level=logging.DEBUG,
)
TWITTER_CONSUMER_KEY = os.environ.get("TWITTER_CONSUMER_KEY")
TWITTER_CONSUMER_SECRET = os.environ.get("TWITTER_CONSUMER_SECRET")
TWITTER_ACCESS_TOKEN = os.environ.get("TWITTER_ACCESS_TOKEN")
TWITTER_ACCESS_TOKEN_SECRET = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET")
LOCAL_TIMEZONE = pytz.timezone("US/Pacific")
TWITTER_TIMEZONE = timezone("UTC")
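# the four credentials above come from environment variables, so something along
# these lines (values are placeholders, not real keys) needs to happen before running:
#   export TWITTER_CONSUMER_KEY="..."
#   export TWITTER_CONSUMER_SECRET="..."
#   export TWITTER_ACCESS_TOKEN="..."
#   export TWITTER_ACCESS_TOKEN_SECRET="..."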
class TwitterHashtagSearch(object):

    # you can really only search back 6 or 7 days
    start_date_for_search = LOCAL_TIMEZONE.localize(datetime.datetime(2020, 3, 4, 8, 0))

    # hashtag to search
    hashtag = "#NICAR20"

    # column names for our csv
    # this will change if you pull in more data
    csv_headers = [
        "hashtag",
        "tweet_utc_date",
        "user_name",
        "user_screen_name",
        "bot_or_not",
        "tweet_text",
        "tweet_url",
        "tweet_id",
        "user_profile_image_url",
        "user_location",
        "source",
        "in_reply_to_screen_name",
        "in_reply_to_status_id",
        "image_link",
        "retweet_count",
        "favorite_count",
        "time_zone",
        "geo_enabled",
        "geography",
        "coordinates",
        "lang",
    ]

    # what we'll name our csv file
    csv_filename = "_%s_tweets.csv" % (hashtag)
    def _init(self, *args, **kwargs):
        """
        start the whole twitter hashtag search a-rollin'
        """
        # default params for our loop
        max_id = None
        search_is_done = False

        # set our date defaults for comparisons
        start_date_utc = self.start_date_for_search.astimezone(TWITTER_TIMEZONE)

        # open a file (newline="" keeps the csv module from writing blank rows)
        with open(self.csv_filename, "w", newline="") as csv_file:

            # that will become our csv
            csv_output = csv.writer(csv_file, delimiter=",", quoting=csv.QUOTE_ALL)

            # write the header row to the csv file
            csv_output.writerow(self.csv_headers)

            # begin the loop
            while not search_is_done:

                # return our tweets
                tweet_results = self.construct_twitter_search(self.hashtag, max_id)

                # for each status
                for tweet in tweet_results["statuses"]:

                    # get the UTC time for each
                    tweet_date = parser.parse(tweet["created_at"])

                    # set some timezone information
                    tweet_date = tweet_date.replace(tzinfo=TWITTER_TIMEZONE)

                    # if the tweet falls on or after our start date
                    if tweet_date >= start_date_utc:

                        # build a new csv row
                        csv_row = self.build_csv_row_from(tweet, tweet_date)

                        # write the new csv row
                        csv_output.writerow(csv_row)

                # once through this page of results, grab the new max id, which is in effect paging
                max_id = self.get_max_id(tweet_results)

                # if no max_id
                if max_id is None:

                    # end the loop
                    search_is_done = True

                # otherwise
                else:

                    # get more of them
                    print("Retrieving more tweets since %s" % (max_id))
    def construct_twitter_search(self, hashtag, max_id):
        """
        function to auth with twitter and return tweets
        """
        # build the authorization for the twitter api
        twitter_object = Twitter(
            auth=OAuth(
                TWITTER_ACCESS_TOKEN,
                TWITTER_ACCESS_TOKEN_SECRET,
                TWITTER_CONSUMER_KEY,
                TWITTER_CONSUMER_SECRET
            )
        )

        # retrieve the tweets (the standard search api returns at most 100 per request)
        tweet_results = twitter_object.search.tweets(
            q=hashtag,
            count=100,
            result_type="recent",
            include_entities=True,
            max_id=max_id,
            lang="en",
            until="2020-03-12"
        )
        logger.debug(tweet_results)

        # return them
        return tweet_results
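
    # the dict returned above carries the two keys the rest of the script relies on:
    # "statuses", a list of tweet dicts, and "search_metadata", which may include a
    # "next_results" query string used for paging (see get_max_id below)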
    def build_csv_row_from(self, tweet, tweet_date):
        """
        create a csv row from tweet results
        """
        # construct url format
        tweet_url = "https://twitter.com/" + tweet["user"]["screen_name"] + "/status/" + str(tweet["id"])

        # output some information
        print("%s - %s - %s" % (tweet_date, tweet["user"]["screen_name"], tweet_url))

        # see if an image is present; media entities live in a list under "entities"
        has_image = "media" in tweet.get("entities", {})

        # if there are images
        if has_image:

            # grab the first one
            tweet_image = tweet["entities"]["media"][0]["media_url_https"]

        # otherwise
        else:

            # call it none
            tweet_image = None

        # build a row of tweet data
        csv_row_data = [
            self.hashtag,
            tweet_date,
            tweet["user"]["name"],
            tweet["user"]["screen_name"],
            self.check_text_for_bot(tweet["text"]),
            tweet["text"],
            tweet_url,
            tweet["id"],
            tweet["user"]["profile_image_url"],
            tweet["user"]["location"],
            tweet["source"],
            tweet["in_reply_to_screen_name"],
            tweet["in_reply_to_status_id_str"],
            tweet_image,
            tweet["retweet_count"],
            tweet["favorite_count"],
            tweet["user"]["time_zone"],
            tweet["user"]["geo_enabled"],
            tweet["geo"],
            tweet["coordinates"],
            tweet["lang"],
        ]

        # print the row
        print(csv_row_data)

        # return the row
        return csv_row_data
    def check_text_for_bot(self, tweet_text):
        """
        let's see if we can identify a bot
        nothing special here
        knew some had the same message
        """
        bot_check = re.compile("#NICAR20 View here ")
        bot_match = re.search(bot_check, tweet_text)
        if bot_match:
            is_bot = True
        else:
            is_bot = False
        return is_bot
    def get_max_id(self, results):
        """
        get the max_id of the next twitter search if present
        """
        # see if the metadata has a next_results key
        # its value holds the max_id to pull the next page of tweets from
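        # next_results is a query string that looks something like this (illustrative, not a real response):
        #   "?max_id=1237031222453477376&q=%23NICAR20&count=100&include_entities=1&result_type=recent"
        # so splitting on "&" and then on "?max_id=" leaves just the id string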
        more_tweets = "next_results" in results["search_metadata"]

        # if there are more
        if more_tweets:

            # find the max id
            parsed_string = results["search_metadata"]["next_results"].split("&")
            parsed_string = parsed_string[0].split("?max_id=")
            max_id = parsed_string[1]

        # otherwise
        else:

            # max id is nothing
            max_id = None

        # return the max id
        return max_id
if __name__ == '__main__':
    task_run = TwitterHashtagSearch()
    task_run._init()
    print("\nTask finished at %s\n" % str(datetime.datetime.now()))