Created
August 30, 2014 19:22
-
-
Save SuperDoxin/8fa4a2ccd10a0c34fca1 to your computer and use it in GitHub Desktop.
Split an email dump file into one file per month and clean up the messages in general.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import codecs
import cStringIO as StringIO
import email.generator
import email.parser
import errno
import json
import os.path
import sys
import time

import parsedatetime.parsedatetime as pdt
# Date parser shared by the whole script.  It is built without an explicit
# constants object (the pdc.Constants() variant is intentionally disabled).
p = pdt.Calendar()

# Text encoding of the input "maildump" file.
encoding = "UTF-16LE"

# Text encoding used when writing the output files.
outencoding = "UTF-8"

# Headers copied to the output; every other header is dropped.
keepheaders = [
    u'From',
    u'Newsgroups',
    u'Subject',
    u'Keywords',
    u'Message-ID',
    u'Date',
    u'References',
    u'Sender',
    u'Organization',
    u'Lines',
]

# Parsed messages; handlemail() appends to this list.
msglist = []
def handlemail(data):
    """Parse one raw message and append it to the global ``msglist``.

    data -- complete text of a single message.  Empty input is ignored,
            so nothing is appended for it.
    """
    if len(data) > 0:
        parser = email.parser.FeedParser()
        parser.feed(data)
        # close() finishes parsing and hands back the message object.
        msglist.append(parser.close())
def msg_getdate(msg):
    """Return the parsed date of *msg*, or None when it carries no date.

    The ``Date`` header is used when present.  Otherwise the value of
    ``X-Google-ArrivalTime`` is copied into a new ``Date`` header (the
    message is modified in place) and that value is parsed instead.

    Returns the result of ``p.parse`` on the header text, or None if
    neither header exists.
    """
    datetext = msg["Date"]
    if datetext is None:
        datetext = msg["X-Google-ArrivalTime"]
        if datetext is None:
            # Nothing to parse; do not store a None-valued Date header.
            return None
        msg["Date"] = datetext
    # NOTE(review): parsedatetime is documented to return a zero status
    # flag together with the current time when the text cannot be parsed;
    # such messages would be dated "now" -- confirm this is acceptable.
    return p.parse(datetext)
# Read the whole dump, decoding it with the configured input encoding.
with codecs.open("maildump", encoding=encoding) as fid:
    data = fid.read()

# Drop a leading byte-order mark only when one is really there, so a dump
# without a BOM does not lose its first character.
if data.startswith(u"\ufeff"):
    data = data[1:]

# Every line starting with "From " separates two messages and is itself
# discarded; the lines in between are collected and parsed as one message.
currentmail = []
for line in data.split("\r\n"):
    if line.startswith("From "):
        handlemail("\r\n".join(currentmail))
        currentmail = []
    else:
        currentmail.append(line)

# The last message is not followed by a separator line, so it has to be
# flushed explicitly (handlemail ignores empty input).
handlemail("\r\n".join(currentmail))

print("parsed %d messages" % len(msglist))
def _ensure_dir(path):
    """Create directory *path* (with parents) unless it already exists."""
    try:
        os.makedirs(path)
    except OSError as exc:
        # "Already exists" is the only expected failure; anything else
        # (permissions, a file in the way, ...) must not be hidden.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise


# Order the messages by their parsed date before writing them out.
msglist.sort(key=msg_getdate)

for msg in msglist:
    date = msg_getdate(msg)
    newheaders = {}
    if date is None:
        # Undated messages are collected in a single file.
        outdir = "dump"
        outfname = "dump/nodate.mail"
    else:
        # date[0] is the time tuple; its first two fields are used as
        # year (directory name) and month (file name).
        stamp = date[0]
        outdir = "dump/{}".format(stamp[0])
        outfname = "dump/{}/{}.mail".format(stamp[0], stamp[1])
        # Fallback Date value; it is replaced below by the message's own
        # header whenever that header is spelled exactly "Date".
        newheaders["Date"] = time.strftime("%Y-%m-%d %H:%M:%S", stamp)
    # Also covers the plain "dump" directory needed for nodate.mail.
    _ensure_dir(outdir)

    # Keep only the whitelisted headers (name match is case-sensitive).
    for header in msg.keys():
        if header in keepheaders:
            value = msg[header]
            if value is not None:
                newheaders[header] = value

    # Strip every header from the message, then put the kept ones back.
    for header in msg.keys():
        del msg[header]
    for name, value in newheaders.items():
        msg[name] = value

    # NOTE(review): cStringIO is documented as unable to hold Unicode text
    # that is not plain ASCII -- confirm the dumps are ASCII-only.
    tfid = StringIO.StringIO()
    email.generator.Generator(tfid).flatten(msg)

    # Append the cleaned message plus a separator to the month's file.
    with codecs.open(outfname, "a", outencoding) as fid:
        fid.write(tfid.getvalue())
        fid.write("\r\n\r\n---------\r\n\r\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment