Skip to content

Instantly share code, notes, and snippets.

@corydolphin
Created February 3, 2012 07:00
Show Gist options
  • Save corydolphin/1728599 to your computer and use it in GitHub Desktop.
Save corydolphin/1728599 to your computer and use it in GitHub Desktop.
Quick script to parse mailbox archives to grab plaintext information
#!/usr/bin/env python
"""
parseEmail.py: Parses mbox-formatted email boxes for the interesting bits
Usage: ./parseEmail.py dir
Where dir is a directory containing mbox files
"""
import mailbox
import os
import sys
import re
def parseDate(aStr):
    """Split a date header into its six whitespace-separated text fields.

    Example input: 'Tue, 31 Aug 2010 20:53:12 -0400'

    Returns a tuple of strings
    (weekday, day, month, year, time, utc_offset). The fields are returned
    as text; no numeric or timezone conversion is attempted.

    Raises ValueError when the header does not consist of exactly six
    fields.
    """
    # The fields are separated by spaces, not commas: drop the comma that
    # follows the weekday and split on whitespace. (Splitting on ',' after
    # the commas had been removed could never produce six values.)
    # TODO: the original author suggested replacing this with strptime,
    # noting that strptime does not handle tzinfo.
    fields = aStr.replace(',', '').split()
    if len(fields) != 6:
        raise ValueError('unrecognised date header: %r' % (aStr,))
    dW, d, mon, year, time, off = fields
    return (dW, d, mon, year, time, off)
def parseBody(aBody):
    """Extract the text of the first part of a '--boundary' delimited body.

    aBody is the message payload as a single string.

    Returns:
      * the stripped text found between the part's leading headers and the
        next occurrence of the boundary line (or the end of the text when
        the boundary never reappears), with '=' + newline soft line breaks
        joined back together;
      * '' when the payload does not start with '--';
      * None when the payload is only a boundary line with nothing after
        it, or when the part's Content-Type line mentions 'multipart' or
        'alternative' (nested parts are not handled).
    """
    if aBody[:2] != '--':
        # NOTE(review): payloads that do not open with a boundary line are
        # reported as empty rather than returned verbatim; this mirrors the
        # existing behaviour -- confirm it is intended.
        return ''

    # The first line is the boundary marker that also terminates the part.
    delim, newline, rest = aBody.partition('\n')
    if not newline:
        return None

    if rest.startswith('Content-Type:'):
        header, _, rest = rest.partition('\n')
        # Only the header line itself is inspected, so body text that merely
        # contains these words is not mistaken for a multipart container.
        if 'multipart' in header or 'alternative' in header:
            return None

    if rest.startswith('Content-Transfer-Encoding'):
        rest = rest.partition('\n')[2]

    # A missing closing boundary means the part runs to the end of the text;
    # slicing with find()'s -1 would silently drop the final character.
    end = rest.find(delim)
    realBody = rest if end == -1 else rest[:end]

    # '=' at the end of a line is a soft line break; join the pieces.
    return realBody.replace('=\n', '').strip()
def parseMessage(m):
    """Collect the headers and text body that the report needs from a message.

    m is a message object offering get(header_name) and get_payload().

    Returns the tuple (mID, date, sender, subject, body):
      * mID, date and sender are the raw 'Message-ID', 'Date' and 'From'
        header values, or None when the header is absent;
      * subject is the 'Subject' header with double quotes replaced by
        single quotes, or None when the header is absent;
      * body is the result of parseBody() on the payload (double quotes
        replaced by single quotes first), or None when the payload is not
        a single string (a list of sub-messages, or no payload at all).
    """
    sender = m.get('From')
    date = m.get('Date')
    mID = m.get('Message-ID')

    subject = m.get('Subject')
    if subject is not None:
        subject = subject.replace('"', "'")

    payload = m.get_payload()
    if payload is None or isinstance(payload, list):
        # Multipart messages expose their parts as a list; there is no text
        # to hand to parseBody, so report the body as unavailable.
        body = None
    else:
        body = parseBody(payload.replace('"', "'"))

    return (mID, date, sender, subject, body)
def writeJunk(mBox, junk, aDict=None):
    """Write every parseable message of a mailbox to an open text stream.

    mBox  -- mailbox object providing iterkeys() and get_message(key).
    junk  -- writable text stream that receives one record per message.
    aDict -- optional dict mapping sender -> number of messages written;
             it is updated in place. A fresh dict is created when omitted,
             so counts are never shared between unrelated calls.

    Messages whose body comes back as None from parseMessage() are counted
    as failed and skipped.

    Returns the tuple (parsed, failed, aDict).
    """
    if aDict is None:
        aDict = {}
    parsed = 0
    failed = 0
    for val in mBox.iterkeys():
        m = mBox.get_message(val)
        mID, date, sender, subject, body = parseMessage(m)
        if body is None:
            failed += 1
            continue
        # A missing header is written as an empty field instead of raising
        # TypeError on string concatenation.
        junk.write('#-----------------------------#\r\n')
        junk.write("FROM:\t" + (sender or '') + "\r\n")
        junk.write("SUBJECT:\t" + (subject or '') + "\r\n")
        junk.write("DATE:\t" + (date or '') + "\r\n")
        junk.write(body + "\r\n")
        junk.write('#-----------------------------#\n')
        parsed += 1
        aDict[sender] = aDict.get(sender, 0) + 1
    return (parsed, failed, aDict)
if __name__ == '__main__':
    # Script entry point: walk the directory given on the command line and
    # dump every message found in *.mbox* files into <dir>/output.txt.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    if len(sys.argv) != 2:
        print(__doc__)
        sys.exit()
    rootDir = sys.argv[1]
    outFileName = os.path.join(rootDir, 'output.txt')
    print('Writing output to %s' % outFileName)
    aDict = dict()  # sender -> message count, accumulated over all mailboxes
    # The with-block guarantees the report is flushed and closed even if a
    # mailbox fails to parse part-way through.
    with open(outFileName, 'w') as fl:
        for root, dirs, files in os.walk(rootDir):
            for fil in files:
                if '.mbox' in fil:
                    print(fil)
                    # Join with the directory currently being walked, not
                    # the top-level one, so mailboxes in subdirectories
                    # resolve to their real location.
                    myBox = mailbox.mbox(os.path.join(root, fil))
                    try:
                        writeJunk(myBox, fl, aDict)
                    finally:
                        myBox.close()
@corydolphin
Copy link
Author

Hey @OlinSLAC we can perhaps salvage some of this for indexing email archives.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment