Skip to content

Commit 4a97cd6

Browse files
authored
Merge pull request #2 from xiaotaichai/lydia
Merge Normalized Revisions
2 parents cccf1b4 + 327e23f commit 4a97cd6

9 files changed

+190
-15
lines changed

blank_line_check.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from mrjob.job import MRJob
2+
3+
4+
class BlankLineCheck(MRJob):
    """MRJob that counts blank vs. non-blank lines in the input."""

    def mapper(self, _, line):
        # A line consisting only of whitespace counts as blank.
        stripped = line.strip()
        label = 'Not blank' if stripped else 'Blank'
        yield label, 1

    def reducer(self, key, values):
        # Total the per-line 1s emitted for each label.
        yield key, sum(values)


if __name__ == '__main__':
    BlankLineCheck.run()

checking_format.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from mrjob.job import MRJob
2+
import gzip
3+
4+
class CheckFormat(MRJob):
    """MRJob that tallies how many records have the expected field counts.

    A well-formed record splits into 13 '\\x1e'-separated fields, and its
    first field splits into 7 space-separated revision-info tokens.
    """

    def mapper(self, _, line):
        record = line.split('\x1e')
        revision_info = record[0].split(' ')

        rev_ok = len(revision_info) == 7
        rec_ok = len(record) == 13

        # Emit one of four diagnostic labels per record.
        if not rev_ok and not rec_ok:
            yield 'record length and revision info length is bad', 1
        elif not rev_ok:
            yield 'revision info length bad, record length good', 1
        elif not rec_ok:
            yield 'record length bad, revision info length good', 1
        else:
            yield 'revision info length and record length good', 1

    def reducer(self, key, values):
        # Total the per-record 1s for each diagnostic label.
        yield key, sum(values)


if __name__ == '__main__':
    CheckFormat.run()
Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
from mrjob.job import MRJob
22
from mrjob.protocol import TextValueProtocol
33

4-
class RandomSubsample(MRJob):
4+
class RevisionCountTimeline(MRJob):
55

66
#OUTPUT_PROTOCOL = TextValueProtocol
77

88
def mapper(self, _, line):
    """Split one raw '\\x1e'-delimited record and emit
    ((article_id, article_name), revision_date).

    The first '\\x1e' field is space-separated; positions 1, 3 and 4 hold
    the article id, article name and revision date respectively.
    """
    fields = line.split('\x1e')[0].split(' ')

    yield [fields[1], fields[3]], fields[4]
1518

@@ -19,9 +22,9 @@ def reducer(self, key, records):
1922
year, month = record.split('-')[0], record.split('-')[1]
2023
index = (int(year) - 2001)*12 + int(month) - 1
2124
monthly_revision_count[index] += 1
22-
25+
2326
normalized = [float(i)/sum(monthly_revision_count) for i in monthly_revision_count]
2427
yield key, normalized
25-
28+
2629
if __name__ == '__main__':
27-
RandomSubsample.run()
30+
RevisionCountTimeline.run()

create_normalized_revision_lengths_timeline.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,30 @@ class RandomSubsample(MRJob):
77

88
def mapper(self, _, line):
    """Parse one '<<sep>>'-delimited revision record and emit
    ((article_id, article_name), (revision_date, revision_length)).

    Whitespace-only lines are skipped entirely (no yield), which is what
    made the blank-line guard necessary in the first place.
    """
    line = line.strip()
    if line:
        parts = line.split('<<sep>>')
        article_id = parts[0]
        article_name = parts[2]
        revision_date = parts[3]
        # The revision length is always the final field.
        revision_length = parts[-1]

        yield [article_id, article_name], [revision_date, revision_length]
1624

1725
def reducer(self, key, records):
    """Aggregate revision lengths into a 96-month timeline and normalize it.

    Each record is (revision_date 'YYYY-MM-...', revision_length). Buckets
    cover January 2001 through December 2008; the output list sums to 1.
    Raises ZeroDivisionError if every bucket is zero (unchanged behavior).
    """
    # One bucket per month, Jan 2001 .. Dec 2008.
    monthly_revision_count = [0]*96
    for record in records:
        year, month = record[0].split('-')[0], record[0].split('-')[1]
        index = (int(year) - 2001)*12 + int(month) - 1
        monthly_revision_count[index] += int(record[1])

    # Hoist the invariant total: it was recomputed inside the comprehension
    # for every one of the 96 elements.
    total = sum(monthly_revision_count)
    normalized = [float(i)/total for i in monthly_revision_count]
    yield key, normalized
26-
34+
2735
if __name__ == '__main__':
2836
RandomSubsample.run()

creation_timelines_toCSV.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import os
import re

# Convert the timeline dump ('["id", "name"] ["datetime", n, [...]]' lines)
# into a CSV with one article per row.
progress = 0

# Raw string: '\[' and '\s' are invalid escape sequences in a plain literal
# (SyntaxWarning on modern Python). Compiled once instead of per line.
line_pattern = re.compile(
    r'\["([0-9]+?)", "(.+?)"\]\s*'
    r'\["([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2})", ([0-9]+?), \[(.+)\]\]'
)

# Context managers guarantee both files are closed even if a line fails to
# parse (the output file was previously leaked on any exception).
with open('./creation_timelines_5yrs.csv', 'w') as outfile, \
        open('./creation_timelines_5yrs_new.txt', 'r') as infile:
    # write header
    outfile.write('article_id,article_name,creation_datetime,num_revisions,all_revisions\n')

    for line in infile:
        parts = list(line_pattern.findall(line)[0])
        # remove quotes from the revision author usernames
        parts[4] = re.sub('"', '', parts[4])

        new_line = '{0},"{1}",{2},{3},"{4}"\n'.format(parts[0], parts[1], parts[2], parts[3], parts[4])
        outfile.write(new_line)

        progress += 1

        if progress % 10000 == 0:
            print('{} lines processed so far'.format(progress))

print('Done')

get_random_subsample.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import random
44

55
# read in list of unique article id's
6-
with gzip.open('unique_all_articleids.gz','rt') as infile:
6+
with gzip.open('/Akamai_scratch/fanny_kevin_lydia_xiaotai/Wikipedia-Edits-Distributed-Computing/unique_all_articleids.gz','rt') as infile:
77
all_ids = infile.readlines()
88

99
# select n of those id's and strip the new lines

mrjob.conf

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
runners:
2+
local:
3+
local_tmp_dir: /Akamai_scratch/
4+
cleanup_on_failure: ALL
5+
inline:
6+
local_tmp_dir: /Akamai_scratch/
7+
cleanup_on_failure: ALL

mrjob2.conf

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
runners:
2+
local:
3+
local_tmp_dir: /Akamai/
4+
cleanup_on_failure: ALL
5+
inline:
6+
local_tmp_dir: /Akamai/
7+
cleanup_on_failure: ALL

revision_count_timeline_2.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
from mrjob.job import MRJob
2+
from mrjob.step import MRStep
3+
# from mrjob.protocol import TextValueProtocol
4+
import datetime as dt
5+
6+
class RevisionTimeline(MRJob):
    """Build a creation-relative revision timeline per article.

    Only articles created before 2003-01-01 are emitted, so every output
    article has at least five years of history.
    """

    # OUTPUT_PROTOCOL = TextValueProtocol

    def mapperGroupRevisions(self, _, line):
        """Parse one raw '\\x1e'-delimited record into (article key, revision info)."""
        record = line.split('\x1e')
        article_info = record[0].split(' ')

        article_id = article_info[1]
        article_name = article_info[3]

        revision_datetime_str = article_info[4]
        user_name = article_info[5]
        user_id = article_info[6]
        # NOTE(review): assumes fields 11 and 12 are 'label value' pairs with
        # the numeric value second — confirm against the upstream record format.
        revision_length = int(record[12].split(' ')[1])
        minor_flag = int(record[11].split(' ')[1])

        yield [article_id, article_name], [revision_datetime_str, revision_length, minor_flag, user_name, user_id]

    def reducerCreateTimeline(self, key, revisions):
        """Emit (key, [creation datetime, revision count, timeline]) for old articles.

        Each timeline entry is [days since creation, seconds since creation,
        revision length, minor flag, user name, user id].
        """
        revisions = list(revisions)
        if not revisions:
            return

        # Parse every timestamp exactly once; the earliest revision marks
        # article creation. (Previously the minimum was seeded with
        # datetime.now(), which is nondeterministic and wrong for any
        # timestamp that postdates the run.)
        parsed_datetimes = [dt.datetime.strptime(r[0], '%Y-%m-%dT%H:%M:%SZ') for r in revisions]
        creation_datetime = min(parsed_datetimes)

        # Keep only articles with at least a five-year history.
        if creation_datetime < dt.datetime.strptime('2003-01-01', '%Y-%m-%d'):
            num_revisions = len(revisions)
            normalized_revision_timeline = []
            for revision_datetime, r in zip(parsed_datetimes, revisions):
                time_since_creation = revision_datetime - creation_datetime
                normalized_revision_timeline.append(
                    [time_since_creation.days, time_since_creation.seconds, r[1], r[2], r[3], r[4]])

            yield key, [creation_datetime.strftime('%Y-%m-%d %H:%M:%S'), num_revisions, normalized_revision_timeline]

    def steps(self):
        """Single map/reduce step: group revisions, then build timelines."""
        return [
            MRStep(mapper=self.mapperGroupRevisions,
                   reducer=self.reducerCreateTimeline)
        ]


if __name__ == '__main__':
    RevisionTimeline.run()

0 commit comments

Comments
 (0)