Skip to content

Commit 45a99c2

Browse files
Piping data to neo4J
1 parent c501709 commit 45a99c2

File tree

7 files changed

+117
-41
lines changed

7 files changed

+117
-41
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
*.csv
22
Inbox
33
sentitems
4+
*.txt

cleanenron.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
inputfile = open(ifile, "r")
1212
outputfile = open("enronemails.csv", "a")
1313

14+
print "Scanning " + ifile
15+
1416
vals={}
1517
for line in inputfile:
1618
splitline = line.split(":")
@@ -48,4 +50,4 @@
4850
outputfile.write(outputline)
4951
vals={}
5052
counter+=1
51-
print "Added " + str(counter) + " emails to enronemaiils.csv"
53+
print "Added " + str(counter) + " emails to enronemails.csv"

csv2neo4j.py

Lines changed: 48 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,17 @@
11
from py2neo import Graph, Path
22
import parser
33
import json
4+
import re
45

56
graph = Graph()
67
inputfile = open("enronemails.csv", "r")
8+
orphanemailfile = open("orphanemails.csv","r")
9+
queryfile = open("queryfile.txt","w")
710

11+
orphanemails = {}
12+
for orphan in orphanemailfile.readlines():
13+
orphaninfo = orphan.split("|")
14+
orphanemails[orphaninfo[0]]=orphaninfo[1]
815
tx = graph.cypher.begin()
916
i=0
1017
for line in inputfile.readlines():
@@ -18,26 +25,51 @@
1825
cc=parser.parse_email(data[3])
1926
bcc=parser.parse_email(data[4])
2027
subject=data[5].rstrip().lstrip().replace('\"','')
21-
#Replace with neo4j links
22-
#Create all nodes in from and to
23-
#Create links with message subject and date
24-
#Probably best to try in n4j console
28+
for address in fromval:
29+
if (not "email" in address):
30+
address["email"]=orphanemails[address["name"]].lstrip().rstrip()
31+
for address in to:
32+
if (not "email" in address):
33+
address["email"]=orphanemails[address["name"]].lstrip().rstrip()
34+
for address in cc:
35+
if (not "email" in address):
36+
address["email"]=orphanemails[address["name"]].lstrip().rstrip()
37+
for address in bcc:
38+
if (not "email" in address):
39+
address["email"]=orphanemails[address["name"]].lstrip().rstrip()
40+
2541
if (len(fromval)>=1):
26-
query = "MERGE (f"+ str(i) +":Person {name:\"" + fromval[0]["name"] + "\""
27-
if ("email" in fromval[0]):
28-
query += ", email:\"" + fromval[0]["email"] + "\""
29-
query += "}) "
42+
query = "MERGE (f"+ str(i) +":Person {email:\"" + fromval[0]["email"] + "\"}) "
43+
if ("name" in fromval[0]):
44+
query +="ON CREATE SET f"+ str(i) + " += {name:\"" + fromval[0]["name"] + "\"} "
3045
j=0
3146
for recipient in to:
32-
query += "MERGE (t"+ str(i) + "_" + str(j) +":Person {name:\"" + recipient["name"] + "\""
33-
if ("email" in recipient):
34-
query += ", email:\"" + recipient["email"] + "\""
35-
query += "}) "
47+
query += "MERGE (t"+ str(i) + "_" + str(j) +":Person {email:\"" + recipient["email"] + "\"}) "
48+
if ("name" in recipient):
49+
query +="ON CREATE SET t"+ str(i) + "_" + str(j) + " += {name:\"" + recipient["name"] + "\"} "
3650
query += "CREATE (f" + str(i) + ")-[:emailed {date:\'" + parser.cleantext(date) + "\', subject:\'" + parser.cleantext(subject) + "\', method:\'to\'}]->(t" + str(i) + "_" + str(j) + ") "
3751
j+=1
52+
for recipient in cc:
53+
query += "MERGE (cc"+ str(i) + "_" + str(j) +":Person {email:\"" + recipient["email"] + "\"}) "
54+
if ("name" in recipient):
55+
query +="ON CREATE SET cc"+ str(i) + "_" + str(j) + " += {name:\"" + recipient["name"] + "\"} "
56+
query += "CREATE (f" + str(i) + ")-[:emailed {date:\'" + parser.cleantext(date) + "\', subject:\'" + parser.cleantext(subject) + "\', method:\'cc\'}]->(cc" + str(i) + "_" + str(j) + ") "
57+
j+=1
58+
for recipient in bcc:
59+
query += "MERGE (bcc"+ str(i) + "_" + str(j) +":Person {email:\"" + recipient["email"] + "\"}) "
60+
if ("name" in recipient):
61+
query +="ON CREATE SET bcc"+ str(i) + "_" + str(j) + " += {name:\"" + recipient["name"] + "\"} "
62+
query += "CREATE (f" + str(i) + ")-[:emailed {date:\'" + parser.cleantext(date) + "\', subject:\'" + parser.cleantext(subject) + "\', method:\'bcc\'}]->(bcc" + str(i) + "_" + str(j) + ") "
63+
j+=1
64+
65+
query = query.encode("utf-8")
66+
queryfile.write(query + "\n")
67+
# tx.append(query)
68+
# tx.process()
3869
i+=1
39-
tx.append(query)
40-
tx.commit()
70+
# tx.commit()
71+
72+
queryfile.close()
4173
inputfile.close()
42-
#This will require holding the entire thing in memory, it looks like.
43-
#Will it just be easier to put things direclty in from the CSV? Might as well...
74+
orphanemailfile.close()
75+

gatherorphans.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,20 @@
11
import json
22
import sys
33
import parser
4+
import re
45

56
inputfile = open("enronemails.csv", "r")
67
outputfile = open("orphanemails.csv", "w")
78
orphanemails={}
8-
print "Gathering Orphan Emails"
9+
print "Gathering orphan emails"
910

1011
def check_for_orphans(addresses):
1112
for address in addresses:
12-
if (not "email" in address):
13-
orphanemails[address["name"]] = ''
14-
elif (address["name"] in orphanemails):
15-
orphanemails[address["name"]] = address['email']
13+
if ("name" in address):
14+
if (not "email" in address):
15+
orphanemails[address["name"]] = ''
16+
elif (address["name"] in orphanemails):
17+
orphanemails[address["name"]] = address['email']
1618

1719
for line in inputfile.readlines():
1820
data = line.split("|")
@@ -26,6 +28,8 @@ def check_for_orphans(addresses):
2628
check_for_orphans(parser.parse_email(data[4]))
2729
inputfile.close()
2830
for orphan in orphanemails.keys():
31+
if (orphanemails[orphan] == ''):
32+
orphanemails[orphan] = (".".join(re.findall("[\w]+",orphan)) + "@noemail.com").lower()
2933
outputfile.write(orphan + "|" + orphanemails[orphan] + "\n")
3034
outputfile.close()
3135
print('Extracted orphan emails')

loadquery.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
from py2neo import Graph
2+
import time
3+
4+
queryfile = open("queryfile.txt", "r")
5+
graph = Graph()
6+
i=0;
7+
j=0;
8+
resumepoint = 0
9+
for line in queryfile:
10+
if resumepoint:
11+
if i==0:
12+
tx=graph.cypher.begin()
13+
tx.append(line)
14+
tx.process()
15+
if i<100:
16+
i+=1
17+
j+=1
18+
elif i>=100:
19+
tx.commit()
20+
i=0
21+
print "Committing transaction " + str(j)
22+
else:
23+
if (i>235400):
24+
resumepoint = 1
25+
i=0
26+
else:
27+
i+=1
28+
print "Donezo"

parser.py

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,39 +6,48 @@ def parse_email(entry):
66
carryover_name = 0
77
if (not entry):
88
return ''
9-
address_records = entry.split(",")
9+
if (len(re.findall(";",entry))>0):
10+
address_records = re.split("[;]",entry)
11+
elif (len(re.findall("[\w]+",entry))>3):
12+
address_records = re.split("[\\,]",entry)
13+
else:
14+
address_records = [entry]
1015
for address in address_records:
11-
if (not carryover_name):
12-
addressinfo = {}
16+
addressinfo = {}
1317
address.lstrip().rstrip()
14-
address = re.sub("[\\\"<>\\\']",'',address)
18+
address = address.split(" on ")[0]
19+
# Clean extraneous strings.
20+
address = address.replace("@ENRON", '')
21+
address = address.replace("@ ENRON", '')
22+
address = address.replace("(E-mail)", '')
23+
address = address.replace("\(E-mail\)", '')
24+
address = address.replace("\(E-mail\)", '')
25+
address = re.sub("[\\\"<>\\\'\\\\:]",'',address)
1526
for item in address.split(" "):
1627
if (validate_email(item)):
17-
addressinfo["email"]=cleantext(item)
28+
addressinfo["email"]=cleantext(item).rstrip().lstrip().lower()
1829
elif (re.search('/O=ENRON/OU=NA/CN=RECIPIENTS/CN=',item)):
1930
item = item.replace('/O=ENRON/OU=NA/CN=RECIPIENTS/CN=', '')
2031
item = item.lower() + "@enron.com"
21-
addressinfo["email"]=item
32+
addressinfo["email"]=item.rstrip().lstrip().lower()
2233
elif (re.search('\\[mailto\\:', item)):
2334
item = item.replace('[mailto:', '')
2435
item = item.replace(']','')
25-
addressinfo["email"]=cleantext(item)
36+
addressinfo["email"]=cleantext(item).rstrip().lstrip().lower()
2637
elif ("name" in addressinfo):
2738
addressinfo["name"]+=cleantext(item) + " "
2839
else:
2940
addressinfo["name"]=cleantext(item) + " "
30-
addressinfo["name"] = addressinfo["name"].rstrip().lstrip()
31-
if (not "email" in addressinfo):
32-
# orphan_addresses[addressinfo["name"]]=''
33-
carryover_name = 1
34-
if (address_records[-1] is address):
35-
addresses.append(addressinfo)
36-
else:
37-
# if(addressinfo["name"] in orphan_addresses):
38-
# orphan_addresses[addressinfo["name"]] = addressinfo["email"]
39-
addresses.append(addressinfo)
40-
carryover_name = 0
41+
if ("name" in addressinfo):
42+
addressinfo["name"] = addressinfo["name"].rstrip().lstrip()
43+
addresses.append(addressinfo)
4144
return addresses
4245

4346
def cleantext(string):
44-
return re.sub("\'","\\\'", string)
47+
string = string.replace("//","////")
48+
# \\,\\.\\/\\!@#$%\\^\\&*()\\-\\+\\|\\[\\]}{ _\\:;\'\"<>
49+
funkychars = "[^a-zA-Z0-9\,\.\/\\!@#\$\?%\^\&\*\(\)\-\+\|\[\]\}\{= _:;\'\"\<\>]"
50+
if (len(re.findall( funkychars,string)) > 0):
51+
print re.findall(funkychars,string)
52+
string = re.sub(funkychars, " ",string)
53+
return string.encode("string_escape")

parser.pyc

583 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)