Created
February 6, 2014 16:02
-
-
Save danielballan/8847070 to your computer and use it in GitHub Desktop.
Gmail Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import sys | |
import re | |
from dateutil.parser import parse | |
import email | |
import html2text | |
import MySQLdb | |
from itertools import * | |
def traverse_message(msg):
    """Yield the nodes of an email message, which is organized like a tree.

    A message that is not multipart yields only itself.  A multipart
    message yields everything ``msg.walk()`` produces, in that order.
    """
    if not msg.is_multipart():
        yield msg
        return
    for node in msg.walk():
        yield node
def open_email(filename):
    """Read the file at ``filename`` and return it as an email message object.

    The file is read in binary mode and is closed before parsing begins,
    even if reading fails.
    """
    with open(filename, 'rb') as handle:
        raw = handle.read()
    # A binary read yields ``bytes`` on Python 3, which message_from_string
    # rejects.  Use the bytes parser when the email package provides one and
    # fall back to the string parser otherwise (Python 2, where the binary
    # read already returns ``str``).
    parse_raw = getattr(email, 'message_from_bytes', email.message_from_string)
    return parse_raw(raw)
def parse_email_header(msg):
    """Return ``(date, sender, recipients)`` read from a message's headers.

    date       -- the first Date header as an unparsed string, or None when
                  the message has no Date header.
    sender     -- ``(realname, address)`` parsed from the first From header;
                  ``('', '')`` when there is no From header.
    recipients -- list of ``(realname, address)`` pairs collected from the
                  To, Cc, Resent-To and Resent-Cc headers, in that order.
    """
    # parseaddr() expects one header string, not the list that get_all()
    # returns, so fetch the first From header directly.
    sender = email.utils.parseaddr(msg.get('from', ''))
    tos = msg.get_all('to', [])
    ccs = msg.get_all('cc', [])
    resent_tos = msg.get_all('resent-to', [])
    resent_ccs = msg.get_all('resent-cc', [])
    recipients = email.utils.getaddresses(tos + ccs +
                                          resent_tos + resent_ccs)
    # get() yields None for a missing header instead of failing on None[0].
    date = msg.get('date')
    return date, sender, recipients
def _decoded_payload(part):
    "Return a message part's payload with its transfer encoding undone."
    payload = part.get_payload(decode=True)
    if payload is None:
        # decode=True gives None for a multipart container; keep the
        # undecoded payload in that case.
        return part.get_payload()
    if isinstance(payload, str):
        return payload
    # The decoded payload is a byte string here; turn it into text using the
    # charset the part declares, tolerating undecodable bytes.
    charset = part.get_content_charset() or 'ascii'
    try:
        return payload.decode(charset, 'replace')
    except LookupError:
        # The part names a charset this Python does not know.
        return payload.decode('ascii', 'replace')


def read_email_body(msg):
    """Return the message body as one long string of plain text.

    The first ``text/plain`` part is preferred.  If there is none, the first
    ``text/html`` part is converted with html2text.  Returns None when the
    message has neither kind of part.

    Payloads are decoded according to their Content-Transfer-Encoding, so
    base64 and quoted-printable bodies come back as readable text instead
    of their encoded form.
    """
    first_html = None
    for part in traverse_message(msg):
        content_type = part.get_content_type()
        if content_type == 'text/plain':
            return _decoded_payload(part)
        if content_type == 'text/html' and first_html is None:
            first_html = part
    if first_html is not None:
        return html2text.html2text(_decoded_payload(first_html))
    return None  # No plaintext or html parts.
def list_words(text):
    """Return a list of the alphabetic words in ``text``, lowercased.

    A word is a maximal run of the letters a-z after lowercasing; digits,
    punctuation and underscores act as separators.  An empty or None
    ``text`` (for example a message with no readable body) gives ``[]``.
    """
    if not text:
        return []
    return re.findall(r'[a-z]+', text.lower())
def pairwise(iterable):
    """Pair each item with its successor.

    s -> (s0,s1), (s1,s2), (s2, s3), ...
    """
    leading, trailing = tee(iterable, 2)
    # Put the second copy one step ahead of the first.
    next(trailing, None)
    return izip(leading, trailing)
def three_wise(iterable):
    """Group each item with the two items that follow it.

    s -> (s0,s1,s2), (s1,s2,s3), (s2, s3, s4), ...
    """
    first, second, third = tee(iterable, 3)
    # Offset the copies by one and two positions respectively.
    next(second, None)
    for _ in range(2):
        next(third, None)
    return izip(first, second, third)
def insert_tokens(tokens, msg_id, conn):
    """Insert tokens into the database, in order.

    The tokens are first loaded into a temporary table whose auto-increment
    ``pos`` column numbers them sequentially, thus recording their ordering.
    They are then tagged with ``msg_id`` and copied into AllTokens.

    tokens -- iterable of token strings, in message order.
    msg_id -- id of the message the tokens belong to.
    conn   -- open database connection; the work is committed on success.
    """
    c = conn.cursor()
    try:
        # A temporary table outlives this call, so a leftover one (from an
        # earlier call on the same connection) would make CREATE fail and
        # would continue the old position numbering.
        c.execute("DROP TEMPORARY TABLE IF EXISTS NewTokens")
        c.execute("CREATE TEMPORARY TABLE NewTokens ( " +
                  "msg_id smallint unsigned, " +
                  "pos smallint unsigned auto_increment, " +
                  "token varchar(31), " +
                  "primary key (pos))")
        c.executemany("INSERT INTO NewTokens (token) VALUES (%s)",
                      [(token,) for token in tokens])
        c.execute("UPDATE NewTokens SET msg_id=%s", (msg_id, ))
        c.execute("INSERT INTO AllTokens (msg_id, pos, token) SELECT msg_id, pos, token FROM NewTokens")
        c.execute("DROP TEMPORARY TABLE NewTokens")
        conn.commit()
    finally:
        # Release the cursor even when one of the statements fails.
        c.close()
def log_message(filename, date, sender, conn):
    """Record a message in the Messages table unless it is already there.

    filename -- name of the message file; used to detect duplicates.
    date     -- the message's Date header as a string (parsed with dateutil).
    sender   -- ``(realname, address)`` pair; only the address is stored.
    conn     -- open database connection; the insert is committed.

    Returns False if a message with this filename has already been entered.
    Otherwise inserts it and returns the msg_id assigned by the DB.
    """
    lookup = conn.cursor()
    try:
        # Query parameters are passed as a sequence, like every other query
        # in this file; a bare string is not portable across drivers.
        lookup.execute("SELECT filename FROM Messages WHERE filename=%s",
                       (filename,))
        already_entered = lookup.rowcount > 0
    finally:
        lookup.close()
    if already_entered:
        ## A message with this filename has already been entered.
        return False

    when = parse(date)
    # Take the weekday from the parsed date rather than the first three
    # characters of the header: the day-name prefix is optional in a Date
    # header, so slicing can pick up part of the day-of-month instead.
    day_of_week = ('Mon', 'Tue', 'Wed', 'Thu',
                   'Fri', 'Sat', 'Sun')[when.weekday()]

    insert = conn.cursor()
    try:
        insert.execute("INSERT INTO Messages (filename, datetime, day_of_week, " +
                       "sender) " +
                       "VALUES (%s, %s, %s, %s)",
                       (filename, when, day_of_week, sender[1]))
        msg_id = insert.lastrowid
        conn.commit()
    finally:
        insert.close()
    return msg_id
def main():
    """Parse the email file named on the command line into the database.

    The message is logged in the Messages table; if it was not there yet,
    its body is split into words and the words are stored as tokens.
    Prints whether the file was inserted or was already present.
    """
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit('usage: %s EMAIL_FILE' % sys.argv[0])
    filename = sys.argv[1]
    conn = MySQLdb.connect(db='corpus2', user='py')
    try:
        msg = open_email(filename)
        date, sender, recipients = parse_email_header(msg)
        msg_id = log_message(filename, date, sender, conn)
        if msg_id:
            text = read_email_body(msg)
            # read_email_body returns None for a message with no text or
            # html part; record such a message with no tokens rather than
            # crashing after it has already been logged.
            tokens = list_words(text) if text is not None else []
            insert_tokens(tokens, msg_id, conn)
            # Two spaces after the colon reproduce the output of the
            # original two-argument print statement.
            print('Inserted:  %s' % filename)
        else:
            print('Redundant:  %s' % filename)
    finally:
        conn.close()
# Script entry point: run the importer only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment