Piping data to Neo4j

davidjay-samsung · davidjay-samsung · commit 45a99c25cca0 · 2015-05-26T23:52:06.000-04:00
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 *.csv
 Inbox
 sentitems
+*.txt
diff --git a/cleanenron.py b/cleanenron.py
@@ -11,6 +11,8 @@
 inputfile = open(ifile, "r")
 outputfile = open("enronemails.csv", "a")
 
+print "Scanning " + ifile
+
 vals={}
 for line in inputfile:
 	splitline = line.split(":")
@@ -48,4 +50,4 @@
 		outputfile.write(outputline)
 		vals={}
 		counter+=1
-print "Added " + str(counter) + " emails to enronemaiils.csv"
+print "Added " + str(counter) + " emails to enronemails.csv"
diff --git a/csv2neo4j.py b/csv2neo4j.py
@@ -1,10 +1,17 @@
 from py2neo import Graph, Path
 import parser
 import json
+import re
 
 graph = Graph()
 inputfile = open("enronemails.csv", "r")
+orphanemailfile = open("orphanemails.csv","r")
+queryfile = open("queryfile.txt","w")
 
+orphanemails = {}
+for orphan in orphanemailfile.readlines():
+	orphaninfo = orphan.split("|")
+	orphanemails[orphaninfo[0]]=orphaninfo[1]
 tx = graph.cypher.begin()
 i=0
 for line in inputfile.readlines():
@@ -18,26 +25,51 @@
 	cc=parser.parse_email(data[3])
 	bcc=parser.parse_email(data[4])
 	subject=data[5].rstrip().lstrip().replace('\"','')
-	#Replace with neo4j links
-	#Create all nodes in from and to
-	#Create links with message subject and date
-	#Probably best to try in n4j console
+	for address in fromval:
+		if (not "email" in address):
+			address["email"]=orphanemails[address["name"]].lstrip().rstrip()
+	for address in to:
+		if (not "email" in address):
+			address["email"]=orphanemails[address["name"]].lstrip().rstrip()
+	for address in cc:
+		if (not "email" in address):
+			address["email"]=orphanemails[address["name"]].lstrip().rstrip()
+	for address in bcc:
+		if (not "email" in address):
+			address["email"]=orphanemails[address["name"]].lstrip().rstrip()
+
 	if (len(fromval)>=1):
-		query = "MERGE (f"+ str(i) +":Person {name:\"" + fromval[0]["name"] + "\""
-		if ("email" in fromval[0]):
-			query += ", email:\"" + fromval[0]["email"] + "\""
-		query += "}) "
+		query = "MERGE (f"+ str(i) +":Person {email:\"" + fromval[0]["email"] + "\"}) "
+		if ("name" in fromval[0]):
+			query +="ON CREATE SET f"+ str(i) + " += {name:\"" + fromval[0]["name"] + "\"} "
 	j=0
 	for recipient in to:
-		query += "MERGE (t"+ str(i) + "_" + str(j) +":Person {name:\"" + recipient["name"] + "\""
-		if ("email" in recipient):
-			query += ", email:\"" + recipient["email"] + "\""
-		query += "}) "
+		query += "MERGE (t"+ str(i) + "_" + str(j) +":Person {email:\"" + recipient["email"] + "\"}) "
+		if ("name" in recipient):
+			query +="ON CREATE SET t"+ str(i) + "_" + str(j) + " += {name:\"" + recipient["name"] + "\"} "
 		query += "CREATE (f" + str(i) + ")-[:emailed {date:\'" + parser.cleantext(date) + "\', subject:\'" + parser.cleantext(subject) + "\', method:\'to\'}]->(t" + str(i) + "_" + str(j) + ") "
 		j+=1
+	for recipient in cc:
+		query += "MERGE (cc"+ str(i) + "_" + str(j) +":Person {email:\"" + recipient["email"] + "\"}) "
+		if ("name" in recipient):
+			query +="ON CREATE SET cc"+ str(i) + "_" + str(j) + " += {name:\"" + recipient["name"] + "\"} "
+		query += "CREATE (f" + str(i) + ")-[:emailed {date:\'" + parser.cleantext(date) + "\', subject:\'" + parser.cleantext(subject) + "\', method:\'cc\'}]->(cc" + str(i) + "_" + str(j) + ") "
+		j+=1
+	for recipient in bcc:
+		query += "MERGE (bcc"+ str(i) + "_" + str(j) +":Person {email:\"" + recipient["email"] + "\"}) "
+		if ("name" in recipient):
+			query +="ON CREATE SET bcc"+ str(i) + "_" + str(j) + " += {name:\"" + recipient["name"] + "\"} "
+		query += "CREATE (f" + str(i) + ")-[:emailed {date:\'" + parser.cleantext(date) + "\', subject:\'" + parser.cleantext(subject) + "\', method:\'bcc\'}]->(bcc" + str(i) + "_" + str(j) + ") "
+		j+=1
+
+	query = query.encode("utf-8")
+	queryfile.write(query + "\n")
+	# tx.append(query)
+	# tx.process()
 	i+=1
-	tx.append(query)
-tx.commit()
+# tx.commit()
+
+queryfile.close()
 inputfile.close()
-#This will require holding the entire thing in memory, it looks like.
-#Will it just be easier to put things direclty in from the CSV? Might as well...
+orphanemailfile.close()
+
diff --git a/gatherorphans.py b/gatherorphans.py
@@ -1,18 +1,20 @@
 import json
 import sys
 import parser
+import re
 
 inputfile = open("enronemails.csv", "r")
 outputfile = open("orphanemails.csv", "w")
 orphanemails={}
-print "Gathering Orphan Emails"
+print "Gathering orphan emails"
 
 def check_for_orphans(addresses):
 	for address in addresses:
-		if (not "email" in address):
-			orphanemails[address["name"]] = ''
-		elif (address["name"] in orphanemails):
-			orphanemails[address["name"]] = address['email']
+		if ("name" in address):
+			if (not "email" in address):
+				orphanemails[address["name"]] = ''
+			elif (address["name"] in orphanemails):
+				orphanemails[address["name"]] = address['email']
 
 for line in inputfile.readlines():
 	data = line.split("|")
@@ -26,6 +28,8 @@ def check_for_orphans(addresses):
 	check_for_orphans(parser.parse_email(data[4]))
 inputfile.close()
 for orphan in orphanemails.keys():
+	if (orphanemails[orphan] == ''):
+		orphanemails[orphan] = (".".join(re.findall("[\w]+",orphan)) + "@noemail.com").lower()
 	outputfile.write(orphan + "|" + orphanemails[orphan] + "\n")
 outputfile.close()
 print('Extracted orphan emails')
diff --git a/loadquery.py b/loadquery.py
@@ -0,0 +1,28 @@
+from py2neo import Graph
+import time
+
+queryfile = open("queryfile.txt", "r")
+graph = Graph()
+i=0;
+j=0;
+resumepoint = 0
+for line in queryfile:
+	if resumepoint:
+		if i==0:
+			tx=graph.cypher.begin()		
+		tx.append(line)
+		tx.process()
+		if i<100:
+			i+=1
+			j+=1
+		elif i>=100:
+			tx.commit()
+			i=0
+			print "Committing transaction " + str(j)
+	else:
+		if (i>235400):
+			resumepoint = 1
+			i=0
+		else:
+			i+=1
+print "Donezo"
diff --git a/parser.py b/parser.py
@@ -6,39 +6,48 @@ def parse_email(entry):
 	carryover_name = 0
 	if (not entry):
 		return ''
-	address_records = entry.split(",") 
+	if (len(re.findall(";",entry))>0):
+		address_records = re.split("[;]",entry)
+	elif (len(re.findall("[\w]+",entry))>3):
+		address_records = re.split("[\\,]",entry)
+	else:
+		address_records = [entry]
 	for address in address_records:
-		if (not carryover_name):
-			addressinfo = {}
+		addressinfo = {}
 		address.lstrip().rstrip()
-		address = re.sub("[\\\"<>\\\']",'',address)
+		address = address.split(" on ")[0]
+		# Clean extraneous strings.
+		address = address.replace("@ENRON", '')
+		address = address.replace("@ ENRON", '')
+		address = address.replace("(E-mail)", '')
+		address = address.replace("\(E-mail\)", '')
+		address = address.replace("\(E-mail\)", '')
+		address = re.sub("[\\\"<>\\\'\\\\:]",'',address)
 		for item in address.split(" "):
 			if (validate_email(item)):
-				addressinfo["email"]=cleantext(item)
+				addressinfo["email"]=cleantext(item).rstrip().lstrip().lower()
 			elif (re.search('/O=ENRON/OU=NA/CN=RECIPIENTS/CN=',item)):
 				item = item.replace('/O=ENRON/OU=NA/CN=RECIPIENTS/CN=', '')
 				item = item.lower() + "@enron.com"
-				addressinfo["email"]=item
+				addressinfo["email"]=item.rstrip().lstrip().lower()
 			elif (re.search('\\[mailto\\:', item)):
 				item = item.replace('[mailto:', '')
 				item = item.replace(']','')
-				addressinfo["email"]=cleantext(item)
+				addressinfo["email"]=cleantext(item).rstrip().lstrip().lower()
 			elif ("name" in addressinfo):
 				addressinfo["name"]+=cleantext(item) + " "
 			else:
 				addressinfo["name"]=cleantext(item) + " "
-		addressinfo["name"] = addressinfo["name"].rstrip().lstrip()
-		if (not "email" in addressinfo):
-			# orphan_addresses[addressinfo["name"]]=''
-			carryover_name = 1
-			if (address_records[-1] is address):
-				addresses.append(addressinfo)
-		else:
-			# if(addressinfo["name"] in orphan_addresses):
-				# orphan_addresses[addressinfo["name"]] = addressinfo["email"]
-			addresses.append(addressinfo)
-			carryover_name = 0
+		if ("name" in addressinfo):
+			addressinfo["name"] = addressinfo["name"].rstrip().lstrip()
+		addresses.append(addressinfo)
 	return addresses
 
 def cleantext(string):
-	return re.sub("\'","\\\'", string)
+	string = string.replace("//","////")
+	# \\,\\.\\/\\!@#$%\\^\\&*()\\-\\+\\|\\[\\]}{ _\\:;\'\"<>
+	funkychars = "[^a-zA-Z0-9\,\.\/\\!@#\$\?%\^\&\*\(\)\-\+\|\[\]\}\{= _:;\'\"\<\>]"
+	if (len(re.findall( funkychars,string)) > 0):
+		print re.findall(funkychars,string)
+		string = re.sub(funkychars, " ",string)
+	return string.encode("string_escape")
diff --git a/parser.pyc b/parser.pyc

-Original file line number
+Diff line change
@@ @@ -1,3 +1,4 @@ @@
 *.csv
 Inbox
 sentitems
 +*.txt