Enron Corpus Processing, version 2
from os import listdir, chdir
import re

docs = []
# Here's the section where I try to filter useless stuff out.
# Notice near the end all of the regex patterns where I've passed
# re.DOTALL. This is pretty key here. It means that the .+ I have
# referenced within the regex pattern can match any character,
# including newline characters (\n), instead of stopping at the end
# of a line. Since I did not have this in the first version, the
# cautionary/privacy messages people were pasting at the ends of
# their emails were not getting filtered out and were being entered
# into the LDA analysis, putting noise in the topics that were modelled.
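# A minimal check of the difference (illustrative, safe to delete; the
# sample string is made up):
#   >>> demo = "--\nThis message is confidential.\n--"
#   >>> re.search("--+.+--+", demo) is None
#   True
#   >>> re.search("--+.+--+", demo, re.DOTALL) is None
#   False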
email_pat = re.compile(r".+@.+")
to_pat = re.compile(r"To:.+\n")
cc_pat = re.compile(r"cc:.+\n")
subject_pat = re.compile(r"Subject:.+\n")
from_pat = re.compile(r"From:.+\n")
sent_pat = re.compile(r"Sent:.+\n")
received_pat = re.compile(r"Received:.+\n")
ctype_pat = re.compile(r"Content-Type:.+\n")
reply_pat = re.compile(r"Reply- Organization:.+\n")
date_pat = re.compile(r"Date:.+\n")
xmail_pat = re.compile(r"X-Mailer:.+\n")
mimver_pat = re.compile(r"MIME-Version:.+\n")
dash_pat = re.compile(r"--+.+--+", re.DOTALL)
star_pat = re.compile(r"\*\*+.+\*\*+", re.DOTALL)
uscore_pat = re.compile(r" __+.+__+", re.DOTALL)
equals_pat = re.compile(r"==+.+==+", re.DOTALL)
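# What one of these subs does to a header line (illustrative example;
# the address below is made up):
#   >>> to_pat.sub('', "To: jane.doe@enron.com\nSee you at 3.\n")
#   'See you at 3.\n'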
# (the below is the same note as before)
# The Enron emails are in 151 directories, one for each senior management
# employee whose email account was entered into the dataset.
# The task here is to go into each folder and enter each
# email text file into one long nested list.
# I've used readlines() to read in the emails because read()
# didn't seem to work with these email files.
chdir("/home/inkhorn/enron") | |
names = [d for d in listdir(".") if "." not in d] | |
for name in names: | |
chdir("/home/inkhorn/enron/%s" % name) | |
subfolders = listdir('.') | |
sent_dirs = [n for n, sf in enumerate(subfolders) if "sent" in sf] | |
sent_dirs_words = [subfolders[i] for i in sent_dirs] | |
for d in sent_dirs_words: | |
chdir('/home/inkhorn/enron/%s/%s' % (name,d)) | |
file_list = listdir('.') | |
docs.append([" ".join(open(f, 'r').readlines()) for f in file_list if "." in f]) | |
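# A sketch of the same traversal without the repeated chdir() calls, using
# os.walk. This is just one possible alternative, left commented out so the
# script's behavior is unchanged:
#
# import os
# for root, dirnames, files in os.walk("/home/inkhorn/enron"):
#     if "sent" in os.path.basename(root):
#         docs.append([" ".join(open(os.path.join(root, f)).readlines())
#                      for f in files if "." in f])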
# (the below is the same note as before)
# Here I go into each email from each employee, try to filter out all the useless stuff,
# then paste the emails into one long flat list. This is probably inefficient, but oh well -
# Python is pretty fast anyway!
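# For context, each message in the corpus starts with a header block whose
# last line looks something like (the name here is a made-up example):
#   X-FileName: jdoe (Non-Privileged).pst
# so slicing at find(etype) + 4 cuts everything up to and including the
# ".nsf"/".pst" extension, leaving roughly the message body.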
docs_final = []
for subfolder in docs:
    for email in subfolder:
        if ".nsf" in email:
            etype = ".nsf"
        elif ".pst" in email:
            etype = ".pst"
        else:
            # Neither extension found: previously etype carried over from a
            # prior email (or raised a NameError on the very first one), so
            # skip these rather than slice at the wrong spot.
            continue
        email_new = email[email.find(etype) + 4:]
        email_new = to_pat.sub('', email_new)
        email_new = cc_pat.sub('', email_new)
        email_new = subject_pat.sub('', email_new)
        email_new = from_pat.sub('', email_new)
        email_new = sent_pat.sub('', email_new)
        email_new = received_pat.sub('', email_new)
        email_new = email_pat.sub('', email_new)
        email_new = ctype_pat.sub('', email_new)
        email_new = reply_pat.sub('', email_new)
        email_new = date_pat.sub('', email_new)
        email_new = xmail_pat.sub('', email_new)
        email_new = mimver_pat.sub('', email_new)
        email_new = dash_pat.sub('', email_new)
        email_new = star_pat.sub('', email_new)
        email_new = uscore_pat.sub('', email_new)
        email_new = equals_pat.sub('', email_new)
        docs_final.append(email_new)
# (the below is the same note as before)
# Here I proceed to dump each and every email into about 126 thousand separate
# txt files in a newly created 'data' directory. This gets them ready for entry
# into a Corpus using the tm (text mining) package in R.
for n, doc in enumerate(docs_final):
    with open("/home/inkhorn/enron/data/%s.txt" % n, 'w') as outfile:
        outfile.write(doc)
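# Optional sanity check (illustrative; assumes the data directory already
# exists and held no other .txt files beforehand):
# assert len([f for f in listdir("/home/inkhorn/enron/data")
#             if f.endswith(".txt")]) == len(docs_final)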