Script to read, filter, and output all enron emails into many files in one directory
from os import listdir, chdir, makedirs
import re

docs = []
# Here's my attempt at coming up with regular expressions to filter out
# parts of the enron emails that I deem as useless.
email_pat = re.compile(r".+@.+")
to_pat = re.compile(r"To:.+\n")
cc_pat = re.compile(r"cc:.+\n")
subject_pat = re.compile(r"Subject:.+\n")
from_pat = re.compile(r"From:.+\n")
sent_pat = re.compile(r"Sent:.+\n")
received_pat = re.compile(r"Received:.+\n")
ctype_pat = re.compile(r"Content-Type:.+\n")
reply_pat = re.compile(r"Reply- Organization:.+\n")
date_pat = re.compile(r"Date:.+\n")
xmail_pat = re.compile(r"X-Mailer:.+\n")
mimver_pat = re.compile(r"MIME-Version:.+\n")
contentinfo_pat = re.compile(r"----------------------------------------.+----------------------------------------")
forwardedby_pat = re.compile(r"----------------------.+----------------------")
caution_pat = re.compile(r'''\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*.+\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*''')
privacy_pat = re.compile(r" _______________________________________________________________.+ _______________________________________________________________")
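# As a quick illustration (hypothetical sample text, not from the corpus),
# each header pattern above deletes one full header line:
#   >>> to_pat.sub('', "To: jane.doe@enron.com\nMeeting moved to 3pm.\n")
#   'Meeting moved to 3pm.\n'
# email_pat is broader: it blanks whatever portion of a line still looks
# like it contains an address.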
# The enron emails are in 151 directories representing each senior management
# employee whose email account was entered into the dataset.
# The task here is to go into each folder, and enter each
# email text file into one long nested list.
# I've used readlines() to read in the emails because read()
# didn't seem to work with these email files.
chdir("/home/inkhorn/enron")
# Employee directories in the corpus layout contain no ".", ordinary files do,
# so this keeps only the per-employee folders.
names = [d for d in listdir(".") if "." not in d]
for name in names:
    chdir("/home/inkhorn/enron/%s" % name)
    subfolders = listdir('.')
    sent_dirs = [n for n, sf in enumerate(subfolders) if "sent" in sf]
    sent_dirs_words = [subfolders[i] for i in sent_dirs]
    for d in sent_dirs_words:
        chdir('/home/inkhorn/enron/%s/%s' % (name, d))
        file_list = listdir('.')
        # latin-1 decodes every possible byte, so files that are not valid
        # UTF-8 won't raise UnicodeDecodeError here.
        docs.append([" ".join(open(f, 'r', encoding='latin-1').readlines()) for f in file_list if "." in f])
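# At this point docs is a nested list: one inner list per "sent*" folder,
# each element holding the full text of one email file.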
# Here I go into each email from each employee, try to filter out all the useless stuff,
# then paste the email into one long flat list. This is probably inefficient, but oh well - python
# is pretty fast anyway!
docs_final = []
for subfolder in docs:
    for email in subfolder:
        # Each email's header block ends with the mailbox filename (".nsf" or
        # ".pst"); if neither marker is present, skip the email rather than
        # reusing a stale etype from a previous iteration.
        if ".nsf" in email:
            etype = ".nsf"
        elif ".pst" in email:
            etype = ".pst"
        else:
            continue
        email_new = email[email.find(etype)+4:]
        email_new = to_pat.sub('', email_new)
        email_new = cc_pat.sub('', email_new)
        email_new = subject_pat.sub('', email_new)
        email_new = from_pat.sub('', email_new)
        email_new = sent_pat.sub('', email_new)
        email_new = email_pat.sub('', email_new)
        # str.replace is a no-op when the marker is absent, so no guard is needed.
        email_new = email_new.replace("-----Original Message-----", "")
        email_new = ctype_pat.sub('', email_new)
        email_new = reply_pat.sub('', email_new)
        email_new = date_pat.sub('', email_new)
        email_new = xmail_pat.sub('', email_new)
        email_new = mimver_pat.sub('', email_new)
        email_new = contentinfo_pat.sub('', email_new)
        email_new = forwardedby_pat.sub('', email_new)
        email_new = caution_pat.sub('', email_new)
        email_new = privacy_pat.sub('', email_new)
        docs_final.append(email_new)
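# docs_final is now a flat list of cleaned email bodies, one string per email.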
# Here I proceed to dump each and every email into about 126 thousand separate
# txt files in a newly created 'data' directory. This gets them ready for entry
# into a Corpus using the tm (text mining) package from R.
makedirs("/home/inkhorn/enron/data", exist_ok=True)
for n, doc in enumerate(docs_final):
    with open("/home/inkhorn/enron/data/%s.txt" % n, 'w') as outfile:
        outfile.write(doc)
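# Optional sanity check: the dump should have produced one file per email.
#   >>> len(listdir("/home/inkhorn/enron/data")) == len(docs_final)
#   True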
Thanks for doing this. I have analysed and run it, but it is not working; it throws this error:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfe in position 0: invalid start byte
Some of the files are encoded differently, while the other files are normal.
Have you run this solution on the full data set?
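One possible workaround, sketched below under the assumption that a single-byte fallback such as latin-1 is acceptable for the odd files (0xfe is never a valid UTF-8 start byte, and a 0xfe at position 0 can also indicate a UTF-16 byte-order mark); read_email is a hypothetical helper, not part of the gist:

def read_email(path):
    # Try UTF-8 first; fall back to latin-1, which maps every byte and
    # therefore never raises UnicodeDecodeError.
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        with open(path, 'r', encoding='latin-1') as f:
            return f.read()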