Skip to content

Instantly share code, notes, and snippets.

@SuperDoxin
Created August 30, 2014 19:22
Show Gist options
  • Save SuperDoxin/8fa4a2ccd10a0c34fca1 to your computer and use it in GitHub Desktop.
Save SuperDoxin/8fa4a2ccd10a0c34fca1 to your computer and use it in GitHub Desktop.
Split an email dump into one file per month and clean up the message headers.
import codecs
import os.path
import sys
import json
import email.parser
import email.generator
import parsedatetime.parsedatetime as pdt
import cStringIO as StringIO
import time
# Date parser shared by the whole script. It is created with the library
# defaults; passing an explicit parsedatetime_consts.Constants() is disabled.
p = pdt.Calendar()

# Text encoding of the input dump and of the files that are written out.
encoding = "UTF-16LE"
outencoding = "UTF-8"

# Header names (exact spelling) that are copied into the cleaned-up messages.
keepheaders = [
    u'From',
    u'Newsgroups',
    u'Subject',
    u'Keywords',
    u'Message-ID',
    u'Date',
    u'References',
    u'Sender',
    u'Organization',
    u'Lines',
]

# Parsed messages, filled in by handlemail().
msglist = []
def handlemail(data):
    """Parse one raw message and append it to the global ``msglist``.

    data -- the complete text of a single message (header lines, a blank
            line, then the body). Empty input is ignored, so the caller may
            hand over its line buffer even when nothing was collected.

    Returns None; the parsed message object is only stored in ``msglist``.
    """
    if not data:
        return
    parser = email.parser.FeedParser()
    parser.feed(data)
    msglist.append(parser.close())
def msg_getdate(msg):
    """Return the parsed date of *msg*, or None when it carries no date.

    The "Date" header is used when present. Otherwise the value of
    "X-Google-ArrivalTime" is copied into a new "Date" header and parsed
    instead. The result is whatever ``p.parse`` returns; the rest of the
    script indexes it as ``result[0]`` to get the time tuple.
    """
    if "Date" not in msg:
        arrival = msg["X-Google-ArrivalTime"]
        if arrival is None:
            # Nothing to fall back on: leave the message untouched instead of
            # storing a "Date" header whose value is None.
            return None
        msg["Date"] = arrival
    raw = msg["Date"]
    if raw is None:
        return None
    return p.parse(raw)
# Read the whole dump into memory as decoded text.
with codecs.open("maildump", encoding=encoding) as fid:
    data = fid.read()

# Drop a leading byte-order mark, but only when one is really there, so a
# dump without a BOM does not lose the "F" of its first "From " line.
if data.startswith(u"\ufeff"):
    data = data[1:]

# Messages are delimited by lines starting with "From "; the delimiter line
# itself is not part of any message.
currentmail = []
for line in data.split("\r\n"):
    if line.startswith("From "):
        # Everything collected so far is one complete message.
        handlemail("\r\n".join(currentmail))
        currentmail = []
    else:
        currentmail.append(line)

# The final message has no delimiter after it, so flush it explicitly.
# Skip the flush when only blank lines remain (e.g. a trailing newline).
tail = "\r\n".join(currentmail)
if tail.strip():
    handlemail(tail)

# Emits the same text as the original print statement on Python 2 and 3.
print("parsed {} messages".format(len(msglist)))
# Parse every date exactly once: the result is needed both for sorting and
# for choosing the output file.
dated = [(msg_getdate(msg), msg) for msg in msglist]


def _dated_sort_key(pair):
    """Order undated messages first, then by the parsed date tuple."""
    parsed = pair[0]
    if parsed is None:
        return (0,)
    # Explicit rank so the ordering never depends on comparing None with a
    # tuple.
    return (1,) + tuple(parsed)


dated.sort(key=_dated_sort_key)
# Keep msglist itself sorted, as the in-place sort used to do.
msglist[:] = [msg for _, msg in dated]

# NOTE(review): a date string the parser cannot understand still yields a
# result here, so such messages are filed by whatever time it returned --
# confirm against the parsedatetime parse-status flag (date[1]).
for date, msg in dated:
    kept_headers = []  # (name, value) pairs in the order they are written
    if date is None:
        outfname = "dump/nodate.mail"
    else:
        # date[0] is the time tuple: [0] is the year, [1] the month.
        outfname = "dump/{}/{}.mail".format(date[0][0], date[0][1])
        kept_headers.append(
            ("Date", time.strftime("%Y-%m-%d %H:%M:%S", date[0])))

    # Create the target directory for dated and undated messages alike;
    # any real failure (permissions, ...) is allowed to surface.
    outdir = os.path.dirname(outfname)
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # Collect the headers worth keeping. A name that is already present (the
    # normalised "Date") is not overwritten by the raw header value.
    taken = set(name for name, _ in kept_headers)
    for header in msg.keys():
        if header in keepheaders and header not in taken:
            taken.add(header)
            value = msg[header]
            if value is not None:
                kept_headers.append((header, value))

    # Strip every header, then put back only the selected ones.
    for header in msg.keys():
        del msg[header]
    for k, v in kept_headers:
        msg[k] = v

    # NOTE(review): cStringIO stores byte strings; messages with non-ASCII
    # text may fail to serialise here -- confirm with real data.
    tfid = StringIO.StringIO()
    email.generator.Generator(tfid).flatten(msg)

    with codecs.open(outfname, "a", outencoding) as fid:
        fid.write(tfid.getvalue())
        fid.write("\r\n\r\n---------\r\n\r\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment