Skip to content

Instantly share code, notes, and snippets.

@GrantTrebbin
Created October 19, 2015 09:50
Show Gist options
  • Save GrantTrebbin/2c575c5932b411b589a5 to your computer and use it in GitHub Desktop.
Save GrantTrebbin/2c575c5932b411b589a5 to your computer and use it in GitHub Desktop.
Crudely analyse the Subject and From headers of the emails in an mbox file to generate word-frequency statistics
import itertools
import mailbox
import re
from collections import Counter
from email.header import decode_header
# Every character that is not an ASCII letter or digit is stripped from a
# token before it is counted.
regex = re.compile('[^a-zA-Z0-9]')


def header_words(raw_header):
    """Return the normalised words contained in one raw header value.

    The header is decoded with ``decode_header`` and every decoded chunk is
    split on whitespace. Each token is stripped of non-alphanumeric
    characters and lower-cased; tokens that end up empty are dropped.

    Args:
        raw_header: The header value as returned by ``message['name']``,
            or ``None`` when the message does not carry that header.

    Returns:
        A list of lower-case alphanumeric words, in header order.
    """
    if raw_header is None:
        # The message has no such header; decode_header(None) would raise.
        return []

    found = []
    # Walk every chunk, not just the first: a header such as
    # '=?utf-8?q?Name?= <user@example.com>' decodes into several chunks.
    for chunk, _charset in decode_header(raw_header):
        # decode_header may hand back bytes even when the charset is None
        # (mixed encoded/plain headers), so test the type, not the charset.
        if isinstance(chunk, bytes):
            # Non-ASCII bytes are discarded here; the regex below would
            # strip non-ASCII characters anyway.
            chunk = chunk.decode('ascii', 'ignore')
        for token in chunk.split():
            word = regex.sub('', token).lower()
            if word:  # pure-punctuation tokens collapse to '' - skip them
                found.append(word)
    return found


def word_frequencies(words):
    """Return ``(count, word)`` tuples sorted by descending count.

    Words with the same count are ordered alphabetically.

    Args:
        words: An iterable of words.

    Returns:
        A list of ``(count, word)`` tuples.
    """
    counts = Counter(words)
    return sorted(
        ((count, word) for word, count in counts.items()),
        key=lambda pair: (-pair[0], pair[1]),
    )


def main(path='Inbox.mbox'):
    """Print one ``(count, word)`` tuple per word found in the mailbox.

    Words are collected from the Subject and From headers of every message.

    Args:
        path: Location of the mbox file to analyse.

    Raises:
        mailbox.NoSuchMailboxError: If ``path`` does not exist.
    """
    words = []
    # create=False: a missing mailbox is reported as an error instead of
    # being silently created as an empty file.
    box = mailbox.mbox(path, create=False)
    try:
        for message in box:
            words.extend(header_words(message['subject']))
            words.extend(header_words(message['from']))
    finally:
        box.close()

    for entry in word_frequencies(words):
        print(entry)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment