Skip to content

Instantly share code, notes, and snippets.

@drott
Created November 29, 2013 12:25
Show Gist options
  • Save drott/7704974 to your computer and use it in GitHub Desktop.
Save drott/7704974 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
from __future__ import division
import mailbox
import re
import pprint
import dateutil.parser
import datetime
# Maildir folders analysed by this script; both paths are relative to the
# current working directory.
# NOTE(review): per the stdlib docs mailbox.Maildir defaults to create=True,
# so a missing folder would be created empty rather than reported — confirm
# that is acceptable here.
junk_box= mailbox.Maildir("Junkmail")  # iterated by get_spam_histogram()
inbox = mailbox.Maildir("Inbox")  # iterated by get_ham_histogram()
def count_histogram(matches, histogram_dict):
    """Tally every entry of ``matches`` into ``histogram_dict``.

    The dictionary is updated in place: each occurrence of an entry adds one
    to its count, and entries not yet present start at 1.

    Args:
        matches: iterable of hashable items (the callers pass rule names).
        histogram_dict: dict mapping item -> occurrence count so far.

    Returns:
        The very same ``histogram_dict`` object, after updating.
    """
    for rule_name in matches:
        histogram_dict[rule_name] = histogram_dict.get(rule_name, 0) + 1
    return histogram_dict
def get_spam_histogram():
    """Count rule hits reported in the X-Spam-Report headers of ``junk_box``.

    Only messages that carry an ``X-Spam-Report`` header are considered.  A
    rule hit is a header line that starts with a single space, an optionally
    negative decimal score, another space and then a rule name made of
    upper-case letters, digits and underscores.

    Returns:
        A ``(histogram, message_count)`` tuple: ``histogram`` maps each rule
        name to the number of times it was matched, ``message_count`` is the
        number of messages that had an ``X-Spam-Report`` header.
    """
    # Raw string: "\." inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about.  Compiled once, outside the loop.
    rule_line = re.compile(r'^ -?[0-9]*\.[0-9]* ([A-Z0-9_]+).*$', re.M)

    spam_histogram = dict()
    spam_messages = 0
    for message in junk_box:
        if "X-Spam-Report" not in message:
            continue
        spam_messages += 1
        rule_matches = rule_line.findall(message["X-Spam-Report"])
        count_histogram(rule_matches, spam_histogram)
    return (spam_histogram, spam_messages)
def get_ham_histogram():
    """Count rule hits in ``inbox`` messages dated after the cutoff date.

    Messages without a ``Date`` header, or whose date cannot be parsed, are
    skipped.  Every remaining message newer than 28 Nov 2012 is counted, and
    if it carries an ``X-Spam-Report`` header its rule hits are tallied.

    Returns:
        A ``(histogram, message_count)`` tuple: ``histogram`` maps each rule
        name to the number of times it was matched, ``message_count`` is the
        number of messages newer than the cutoff (with or without a report).
    """
    # Raw string: "\." inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about.  Compiled once, outside the loop.
    rule_line = re.compile(r'^ -?[0-9]*\.[0-9]* ([A-Z0-9_]+).*$', re.M)
    # Loop-invariant, so computed once instead of per message.
    cutoff_date = datetime.datetime.strptime("11/28/12", "%m/%d/%y")

    newer_messages = 0
    ham_histogram = dict()
    for message in inbox:
        if "Date" not in message:
            continue
        try:
            message_date = dateutil.parser.parse(message["Date"])
        except Exception:
            # Best effort: ignore unparseable dates, but unlike a bare
            # ``except`` let KeyboardInterrupt / SystemExit through.
            continue
        # Drop the timezone (without converting) so the value can be compared
        # with the naive cutoff date.
        message_date = message_date.replace(tzinfo=None)
        if message_date > cutoff_date:
            newer_messages += 1
            # NOTE(review): assumed to be nested under the date check so the
            # histogram and the message count cover the same messages.
            if "X-Spam-Report" in message:
                rule_matches = rule_line.findall(message["X-Spam-Report"])
                count_histogram(rule_matches, ham_histogram)
    return (ham_histogram, newer_messages)
# Raw counts: each result is a (histogram, message_count) tuple.
ham = get_ham_histogram()
spam = get_spam_histogram()

# Turn hit counts into a per-message ratio.  This needs true division, which
# the ``from __future__ import division`` at the top of the file provides on
# Python 2.  ``.items()`` works on both Python 2 and 3.
ham_scaled = {k: v / ham[1] for k, v in ham[0].items()}
spam_scaled = {k: v / spam[1] for k, v in spam[0].items()}

# One line per rule seen in spam, most frequent first:
# rule name, share of spam messages, share of ham messages.
for rule, spam_share in sorted(spam_scaled.items(), key=lambda x: x[1], reverse=True):
    # Rules never seen in ham are reported as 0%.
    ham_share = ham_scaled.get(rule, 0.0)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('{0: <40} {1:.2%} {2:.2%}'.format(rule, spam_share, ham_share))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment