Skip to content

Instantly share code, notes, and snippets.

@danielballan
Created February 6, 2014 16:02
Show Gist options
  • Save danielballan/8847070 to your computer and use it in GitHub Desktop.
Save danielballan/8847070 to your computer and use it in GitHub Desktop.
Gmail Parser
#!/usr/bin/python
import sys
import re
from dateutil.parser import parse
import email
import html2text
import MySQLdb
from itertools import *
def traverse_message(msg):
    """Yield the nodes of an email message, which is organized like a tree.

    A message that is not multipart yields only itself.  A multipart
    message yields everything produced by ``msg.walk()``, in that order.
    """
    if not msg.is_multipart():
        yield msg
        return
    for node in msg.walk():
        yield node
def open_email(filename):
    """Parse the file at *filename* and return an email message object.

    The file is read in binary mode and its handle is closed before the
    contents are parsed.
    """
    with open(filename, 'rb') as handle:
        raw = handle.read()
    # A binary read yields ``str`` on Python 2 but ``bytes`` on Python 3,
    # and message_from_string() only accepts the former.
    if isinstance(raw, str):
        return email.message_from_string(raw)
    return email.message_from_bytes(raw)
def parse_email_header(msg):
    """Return ``(date, sender, recipients)`` read from the headers of *msg*.

    date       -- raw text of the first Date header, or None if there is none.
    sender     -- ``(real_name, address)`` parsed from the first From header,
                  or ``('', '')`` if there is none.
    recipients -- list of ``(real_name, address)`` pairs gathered from the
                  To, Cc, Resent-To and Resent-Cc headers, in that order.
    """
    # Imported locally so the submodule is loaded explicitly instead of
    # relying on some other call having pulled it in under ``email``.
    from email.utils import getaddresses, parseaddr

    # parseaddr() expects one header string, not the list get_all() returns.
    from_values = msg.get_all('from') or ['']
    sender = parseaddr(from_values[0])

    recipient_fields = []
    for header in ('to', 'cc', 'resent-to', 'resent-cc'):
        recipient_fields.extend(msg.get_all(header, []))
    recipients = getaddresses(recipient_fields)

    # A message without a Date header yields None rather than a TypeError.
    date_values = msg.get_all('date') or [None]
    date = date_values[0]
    return date, sender, recipients
def _decoded_payload(part):
    """Return the payload of *part* with its transfer encoding undone."""
    payload = part.get_payload(decode=True)
    if payload is None:
        # decode=True gives None for a multipart container; use the raw form.
        return part.get_payload()
    if isinstance(payload, str):
        # Python 2: the decoded payload is already a plain byte string.
        return payload
    charset = part.get_content_charset() or 'utf-8'
    try:
        return payload.decode(charset, 'replace')
    except LookupError:
        # The message declared a charset this interpreter does not know.
        return payload.decode('utf-8', 'replace')


def read_email_body(msg):
    """Return the body of *msg* as one long string of plain text.

    The first text/plain part is preferred.  If there is none, the first
    text/html part is converted with html2text.  Returns None when the
    message has neither kind of part.

    Base64 and quoted-printable payloads are decoded before being
    returned, so callers see the readable text rather than its encoding.
    """
    parts = list(traverse_message(msg))
    content_types = [part.get_content_type() for part in parts]
    if 'text/plain' in content_types:
        return _decoded_payload(parts[content_types.index('text/plain')])
    if 'text/html' in content_types:
        html = _decoded_payload(parts[content_types.index('text/html')])
        return html2text.html2text(html)
    return None  # No plaintext or html parts.
def list_words(text):
    """Lowercase *text* and return its runs of the letters a-z, in order."""
    lowered = text.lower()
    return [match.group() for match in re.finditer('[a-z]+', lowered)]
def pairwise(iterable):
    """s -> (s0,s1), (s1,s2), (s2, s3), ...

    Lazily yields each pair of adjacent items.  An input with fewer than
    two items yields nothing.  Written as a plain generator so it does not
    depend on ``izip``, which exists only on Python 2.
    """
    iterator = iter(iterable)
    try:
        previous = next(iterator)
    except StopIteration:
        return
    for current in iterator:
        yield previous, current
        previous = current
def three_wise(iterable):
    """s -> (s0,s1,s2), (s1,s2,s3), (s2, s3, s4), ...

    Lazily yields each run of three adjacent items.  An input with fewer
    than three items yields nothing.  Written as a plain generator so it
    does not depend on ``izip``, which exists only on Python 2.
    """
    iterator = iter(iterable)
    try:
        first = next(iterator)
        second = next(iterator)
    except StopIteration:
        return
    for third in iterator:
        yield first, second, third
        first, second = second, third
def insert_tokens(tokens, msg_id, conn):
    """Insert tokens into the database, in order.

    The tokens are staged in the temporary table NewTokens, whose
    auto-increment ``pos`` column numbers them sequentially and so records
    their ordering.  The staged rows are stamped with *msg_id*, copied
    into AllTokens, and the work is committed.

    tokens -- iterable of token strings.
    msg_id -- id of the message the tokens belong to.
    conn   -- open DB-API connection.

    NOTE(review): the staging columns are ``smallint unsigned`` and
    ``varchar(31)``; confirm these limits match the AllTokens schema.
    """
    cursor = conn.cursor()
    try:
        # A temporary table lasts as long as the connection, so clear any
        # copy left by an earlier call before creating a fresh one.
        cursor.execute("DROP TEMPORARY TABLE IF EXISTS NewTokens")
        cursor.execute("CREATE TEMPORARY TABLE NewTokens ( "
                       "msg_id smallint unsigned, "
                       "pos smallint unsigned auto_increment, "
                       "token varchar(31), "
                       "primary key (pos))")
        cursor.executemany("INSERT INTO NewTokens (token) VALUES (%s)",
                           [(token,) for token in tokens])
        cursor.execute("UPDATE NewTokens SET msg_id=%s", (msg_id, ))
        cursor.execute("INSERT INTO AllTokens (msg_id, pos, token) "
                       "SELECT msg_id, pos, token FROM NewTokens")
        cursor.execute("DROP TEMPORARY TABLE NewTokens")
        conn.commit()
    finally:
        # Release the cursor even when one of the statements fails.
        cursor.close()
def log_message(filename, date, sender, conn):
    """Record a message in the Messages table unless it is already there.

    filename -- path of the message file; used as the duplicate key.
    date     -- raw Date header text, parsed with dateutil.
    sender   -- ``(real_name, address)`` pair; only the address is stored.
    conn     -- open DB-API connection.

    Returns False if a row with this filename already exists.  Otherwise
    inserts the message, commits, and returns the msg_id assigned by the
    database (the cursor's ``lastrowid``).
    """
    day_names = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    cursor = conn.cursor()
    try:
        # Parameters must be a sequence; a bare string is not portable.
        cursor.execute("SELECT filename FROM Messages WHERE filename=%s",
                       (filename, ))
        if cursor.fetchone() is not None:
            # A message with this filename has already been entered.
            return False
        when = parse(date)
        # The day name is optional in a Date header, so derive it from
        # the parsed date instead of slicing the header text.
        day_of_week = day_names[when.weekday()]
        cursor.execute("INSERT INTO Messages (filename, datetime, day_of_week, "
                       "sender) "
                       "VALUES (%s, %s, %s, %s)",
                       (filename, when, day_of_week, sender[1]))
        msg_id = cursor.lastrowid
        conn.commit()
        return msg_id
    finally:
        # Release the cursor on every path, including errors.
        cursor.close()
def main():
    """Load the email file named on the command line into the database.

    Reads ``sys.argv[1]``, logs the message, and, if it was not already
    logged, tokenizes its body and stores the tokens.  Prints whether the
    file was inserted or was redundant.  Exits with status 2 and a usage
    line when no filename is given.
    """
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s EMAIL_FILE\n' % sys.argv[0])
        sys.exit(2)
    filename = sys.argv[1]

    conn = MySQLdb.connect(db='corpus2', user='py')
    try:
        msg = open_email(filename)
        date, sender, _ = parse_email_header(msg)
        msg_id = log_message(filename, date, sender, conn)
        if msg_id:
            text = read_email_body(msg)
            # read_email_body() returns None when there is no text or html
            # part; treat that as a message with no tokens.
            tokens = list_words(text) if text is not None else []
            if tokens:
                insert_tokens(tokens, msg_id, conn)
            # Formatted so the output matches on Python 2 and Python 3.
            print('%s %s' % ('Inserted: ', filename))
        else:
            print('%s %s' % ('Redundant: ', filename))
    finally:
        conn.close()
# Run the importer only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment