corydolphin · February 3, 2012 07:00 · corydolphin · Feb 3, 2012
diff --git a/parseEmail.py b/parseEmail.py
 #!/usr/bin/env python
 """
 parseEmail.py: Parses mbox-formatted email boxes for the interesting bits
 Usage:   ./parseEmail.py  dir
 Where dir is a directory containing mbox files
 """

 import mailbox
 import os
 import sys
 import re
 def parseDate(aStr):

    #'Tue, 31 Aug 2010 20:53:12 -0400'
    #this is trash, use strptime, even though it doesn't handle tzinfo
    aStr=aStr.replace(',','')
    dW,d,mon,year,time,off = aStr.split(',')

 def parseBody(aBody):
     """Extract the first text part from a delimiter-framed message body.

     The body is only examined when it begins with '--'; its first line is
     then used as the part delimiter.  A leading 'Content-Type:' line and a
     following 'Content-Transfer-Encoding' line are skipped when present.
     The text up to the next occurrence of the delimiter (or to the end of
     the body when the delimiter never reappears) is returned with every
     '=' + newline pair removed (presumably quoted-printable soft line
     breaks) and surrounding whitespace stripped.

     Returns:
         The extracted text, or '' when aBody does not start with '--'.
         None when there is no line break after the delimiter line, or when
         the part's Content-Type line mentions 'multipart' or 'alternative'
         (nested multipart content is not handled).
     """
     text = aBody
     if text[:2] != '--':
         # NOTE(review): bodies without a leading delimiter produce an empty
         # result rather than the body itself; kept as-is -- confirm intended.
         return ''

     first_break = text.find('\n')
     if first_break == -1:
         return None
     delim = text[:first_break]
     text = text[first_break + 1:]

     if text.startswith('Content-Type:'):
         header, _, text = text.partition('\n')
         # Only the header line decides whether this part is multipart; the
         # message text itself may legitimately contain these words.
         if 'multipart' in header or 'alternative' in header:
             return None
     if text.startswith('Content-Transfer-Encoding'):
         text = text.partition('\n')[2]

     # find() yields -1 when the closing delimiter is missing; slicing with
     # that value would silently drop the last character of the body.
     end = text.find(delim)
     if end != -1:
         text = text[:end]
     return text.replace('=\n', '').strip()


 def parseMessage(m):
     """Collect the fields of interest from a single email message.

     Args:
         m: a message object offering get(header, default) and
            get_payload(), such as the messages returned by a mailbox.

     Returns:
         A tuple (mID, date, sender, subject, body).  sender, date and
         subject fall back to '' when the header is absent; mID is None
         when absent.  Double quotes in the subject and body are replaced
         by single quotes.  body is whatever parseBody() returns for the
         payload, or None when the payload is not a single string (for
         example the list of sub-messages of a multipart message).
     """
     sender = m.get('From', '')
     date = m.get('Date', '')
     subject = m.get('Subject', '').replace('"', "'")
     mID = m.get('Message-ID')

     payload = m.get_payload()
     if payload is None or isinstance(payload, list):
         # Multipart (or empty) payloads have no .replace(); report them as
         # unparsed instead of raising AttributeError.
         return (mID, date, sender, subject, None)

     body = parseBody(payload.replace('"', "'"))
     return (mID, date, sender, subject, body)

 def writeJunk(mBox, junk, aDict=None):
     """Write every parsable message of a mailbox to an open text stream.

     Args:
         mBox: mailbox-like object providing iterkeys() and get_message(key).
         junk: writable file-like object receiving the formatted messages.
         aDict: optional dict mapping sender -> number of messages written;
             it is updated in place.  A fresh dict is created when omitted
             (a dict() default would be shared between unrelated calls).

     Returns:
         A tuple (parsed, failed, aDict): the number of messages written,
         the number skipped because their body was None, and the sender
         tally.
     """
     if aDict is None:
         aDict = {}
     parsed = 0
     failed = 0
     for key in mBox.iterkeys():
         mID, date, sender, subject, body = parseMessage(mBox.get_message(key))
         if body is None:
             failed += 1
             continue
         junk.write('#-----------------------------#\r\n')
         junk.write("FROM:\t" + sender + "\r\n")
         junk.write("SUBJECT:\t" + subject + "\r\n")
         junk.write("DATE:\t" + date + "\r\n")
         junk.write(body + "\r\n")
         # NOTE(review): the closing separator ends in '\n' while every other
         # line ends in '\r\n'; left unchanged to keep the output format.
         junk.write('#-----------------------------#\n')

         parsed += 1
         aDict[sender] = aDict.get(sender, 0) + 1
     return (parsed, failed, aDict)


 if __name__ == '__main__':
     # Exactly one argument is expected: the directory holding the mbox files.
     if len(sys.argv) != 2:
         print(__doc__)
         sys.exit()

     rootDir = sys.argv[1]

     outFileName = os.path.join(rootDir, 'output.txt')
     print('Writing output to %s' % outFileName)

     aDict = dict()
     # The with-block closes (and flushes) the output file even if parsing
     # one of the mailboxes raises.
     with open(outFileName, 'w') as fl:
         for root, dirs, files in os.walk(rootDir):
             for fil in files:
                 if '.mbox' in fil:
                     print(fil)
                     # Join with the directory currently being walked, not
                     # the top-level one, so mbox files inside
                     # subdirectories are opened at their real location.
                     myBox = mailbox.mbox(os.path.join(root, fil))
                     writeJunk(myBox, fl, aDict)
	#!/usr/bin/env python
	"""
	parseEmail.py: Parses mbox-formatted email boxes for the interesting bits
	Usage: ./parseEmail.py dir
	Where dir is a directory containing mbox files
	"""

	import mailbox
	import os
	import sys
	import re
	def parseDate(aStr):
	    """Split a mail 'Date' header into its six text fields.

	    Example input: 'Tue, 31 Aug 2010 20:53:12 -0400'

	    Returns:
	        A tuple of strings (day_of_week, day, month, year, time, offset).

	    Raises:
	        ValueError: if the header does not consist of exactly six
	            whitespace-separated fields (e.g. the day name is missing).
	    """
	    # Remove the comma after the day name, then split on whitespace.  The
	    # earlier version split on ',' *after* deleting every comma, so the
	    # six-way unpacking below could never succeed.
	    # TODO: no conversion to a datetime (or tz handling) is attempted here.
	    dW, d, mon, year, time, off = aStr.replace(',', '').split()
	    return (dW, d, mon, year, time, off)

	def parseBody(aBody):
	    """Extract the first text part from a delimiter-framed message body.

	    The body is only examined when it begins with '--'; its first line is
	    then used as the part delimiter.  A leading 'Content-Type:' line and a
	    following 'Content-Transfer-Encoding' line are skipped when present.
	    The text up to the next occurrence of the delimiter (or to the end of
	    the body when the delimiter never reappears) is returned with every
	    '=' + newline pair removed (presumably quoted-printable soft line
	    breaks) and surrounding whitespace stripped.

	    Returns:
	        The extracted text, or '' when aBody does not start with '--'.
	        None when there is no line break after the delimiter line, or when
	        the part's Content-Type line mentions 'multipart' or 'alternative'
	        (nested multipart content is not handled).
	    """
	    text = aBody
	    if text[:2] != '--':
	        # NOTE(review): bodies without a leading delimiter produce an empty
	        # result rather than the body itself; kept as-is -- confirm intended.
	        return ''

	    first_break = text.find('\n')
	    if first_break == -1:
	        return None
	    delim = text[:first_break]
	    text = text[first_break + 1:]

	    if text.startswith('Content-Type:'):
	        header, _, text = text.partition('\n')
	        # Only the header line decides whether this part is multipart; the
	        # message text itself may legitimately contain these words.
	        if 'multipart' in header or 'alternative' in header:
	            return None
	    if text.startswith('Content-Transfer-Encoding'):
	        text = text.partition('\n')[2]

	    # find() yields -1 when the closing delimiter is missing; slicing with
	    # that value would silently drop the last character of the body.
	    end = text.find(delim)
	    if end != -1:
	        text = text[:end]
	    return text.replace('=\n', '').strip()


	def parseMessage(m):
	    """Collect the fields of interest from a single email message.

	    Args:
	        m: a message object offering get(header, default) and
	           get_payload(), such as the messages returned by a mailbox.

	    Returns:
	        A tuple (mID, date, sender, subject, body).  sender, date and
	        subject fall back to '' when the header is absent; mID is None
	        when absent.  Double quotes in the subject and body are replaced
	        by single quotes.  body is whatever parseBody() returns for the
	        payload, or None when the payload is not a single string (for
	        example the list of sub-messages of a multipart message).
	    """
	    sender = m.get('From', '')
	    date = m.get('Date', '')
	    subject = m.get('Subject', '').replace('"', "'")
	    mID = m.get('Message-ID')

	    payload = m.get_payload()
	    if payload is None or isinstance(payload, list):
	        # Multipart (or empty) payloads have no .replace(); report them as
	        # unparsed instead of raising AttributeError.
	        return (mID, date, sender, subject, None)

	    body = parseBody(payload.replace('"', "'"))
	    return (mID, date, sender, subject, body)

	def writeJunk(mBox, junk, aDict=None):
	    """Write every parsable message of a mailbox to an open text stream.

	    Args:
	        mBox: mailbox-like object providing iterkeys() and get_message(key).
	        junk: writable file-like object receiving the formatted messages.
	        aDict: optional dict mapping sender -> number of messages written;
	            it is updated in place.  A fresh dict is created when omitted
	            (a dict() default would be shared between unrelated calls).

	    Returns:
	        A tuple (parsed, failed, aDict): the number of messages written,
	        the number skipped because their body was None, and the sender
	        tally.
	    """
	    if aDict is None:
	        aDict = {}
	    parsed = 0
	    failed = 0
	    for key in mBox.iterkeys():
	        mID, date, sender, subject, body = parseMessage(mBox.get_message(key))
	        if body is None:
	            failed += 1
	            continue
	        junk.write('#-----------------------------#\r\n')
	        junk.write("FROM:\t" + sender + "\r\n")
	        junk.write("SUBJECT:\t" + subject + "\r\n")
	        junk.write("DATE:\t" + date + "\r\n")
	        junk.write(body + "\r\n")
	        # NOTE(review): the closing separator ends in '\n' while every other
	        # line ends in '\r\n'; left unchanged to keep the output format.
	        junk.write('#-----------------------------#\n')

	        parsed += 1
	        aDict[sender] = aDict.get(sender, 0) + 1
	    return (parsed, failed, aDict)


	if __name__ == '__main__':
	    # Exactly one argument is expected: the directory holding the mbox files.
	    if len(sys.argv) != 2:
	        print(__doc__)
	        sys.exit()

	    rootDir = sys.argv[1]

	    outFileName = os.path.join(rootDir, 'output.txt')
	    print('Writing output to %s' % outFileName)

	    aDict = dict()
	    # The with-block closes (and flushes) the output file even if parsing
	    # one of the mailboxes raises.
	    with open(outFileName, 'w') as fl:
	        for root, dirs, files in os.walk(rootDir):
	            for fil in files:
	                if '.mbox' in fil:
	                    print(fil)
	                    # Join with the directory currently being walked, not
	                    # the top-level one, so mbox files inside
	                    # subdirectories are opened at their real location.
	                    myBox = mailbox.mbox(os.path.join(root, fil))
	                    writeJunk(myBox, fl, aDict)
No results found