Skip to content

Instantly share code, notes, and snippets.

@corydolphin
Created February 16, 2012 02:53
Show Gist options
  • Save corydolphin/1841235 to your computer and use it in GitHub Desktop.
Save corydolphin/1841235 to your computer and use it in GitHub Desktop.
Parses a Mailman Archive site and decodes all of the gzipped text archives into mbox format for reading with most popular email services, or mailbox.py
"""
downloadArchives.py: Parses a Mailman archive site and converts all of the
gzipped text archives into mbox format, readable by most popular
email clients or by Python's mailbox module.
Usage: ./downloadArchives.py [dir] [-f]
Where dir is the directory the mbox files are written to.
If dir is not specified, the output defaults to the current working directory.
Optionally, the -f flag overwrites any existing files that have the same names as the archives.
"""
import mechanize
import cookielib
import os
from BeautifulSoup import BeautifulSoup
# Cookie jar that stores whatever cookies the server hands out, so they are
# sent back on later requests made through the same browser.
cj = cookielib.LWPCookieJar()

# Single shared browser object; every function in this script goes through it.
br = mechanize.Browser()
br.set_cookiejar(cj)

# Do not consult robots.txt before fetching pages.
br.set_handle_robots(False)

# Response handling switches: http-equiv headers, gzip, redirects, Referer.
br.set_handle_referer(True)
br.set_handle_redirect(True)
br.set_handle_gzip(True)
br.set_handle_equiv(True)

# Follow refreshes only up to max_time (1 second), so a page with a long
# refresh timer does not make the script hang.
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

# Send a desktop Firefox User-Agent header with every request.
br.addheaders = [(
    'User-agent',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
    'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1',
)]
def authenticate(username='', password='',
                 baseUrl='https://lists.olin.edu/mailman/private/helpme/'):
    '''
    Log in to the private Mailman archive through the shared browser `br`.

    Opens baseUrl, selects the first form on the returned page, fills in its
    'username' and 'password' fields and submits it. Mailman is expected to
    answer with a cookie, which the browser's cookie jar keeps for the
    following archive requests.

    username -- value for the 'username' form field (the subscriber email)
    password -- value for the 'password' form field
    baseUrl  -- address of the private archive page that shows the login form

    Returns nothing; errors raised by the browser propagate to the caller.
    '''
    br.open(baseUrl)
    # The login form is the first (index 0) form on the page.
    br.select_form(nr=0)
    br.form['username'] = username
    br.form['password'] = password
    br.submit()
def getArchiveUrls(baseUrl='https://lists.olin.edu/mailman/private/helpme/'):
    '''
    Yield the absolute URL of every archive linked from the archive index.

    Fetches baseUrl with the shared browser `br`, parses the page and walks
    over its table rows. For each row the link inside the third cell is
    taken as a URL relative to baseUrl.

    baseUrl -- address of the archive index page; it is used as the prefix of
               every yielded URL, so it must end with a '/'

    Rows without a third cell or without a link in it (for example a header
    row) are reported on stdout and skipped.
    '''
    html = br.open(baseUrl).read()
    soup = BeautifulSoup(html)
    for row in soup.findAll('tr'):
        try:
            # Third cell -> first anchor -> its href (a relative URL).
            archiveSource = row.findAll('td')[2].find('a')['href']
        except (IndexError, TypeError, KeyError) as inst:
            # IndexError: fewer than three cells; TypeError: no anchor in the
            # cell (find() gave None); KeyError: anchor without an href.
            print(inst)
            continue
        # Yield outside the try so only the lookup above is guarded.
        yield '%s%s' % (baseUrl, archiveSource)
def downloadAndDecodeArchive(url, rootDir='', overWrite=False):
    '''
    Download one archive through the shared browser `br` and save it as an
    mbox file.

    url       -- absolute URL of the archive; the part after the last '/' is
                 the archive name, and 'txt.gz' in it is replaced by 'mbox'
                 to form the output file name
    rootDir   -- directory to write into; an empty string means the current
                 working directory. A missing directory is created
                 (best effort: a failure is printed, not raised)
    overWrite -- when False, an already existing output file is left alone

    Returns the path of the written file, or False when the file already
    existed and overWrite was not set.

    While copying, each line starting with "From " has " at " replaced by
    "@" and, except for the first one, is preceded by a blank line. Each
    "Message-ID: " line is followed by a generated multipart Content-Type
    header whose boundary is built from the message id.
    '''
    archiveName = url[url.rfind('/') + 1:]
    outFileName = archiveName.replace('txt.gz', 'mbox')

    # An empty rootDir refers to the current directory, so there is nothing
    # to create in that case.
    if rootDir and not os.path.exists(rootDir):
        try:
            print('The directory you specified does not exist, attempting to create it')
            os.mkdir(rootDir)
        except OSError as inst:
            print(inst)

    outFileName = os.path.join(rootDir, outFileName)
    # Do not overwrite if already downloaded, by default.
    if not overWrite and os.path.exists(outFileName):
        return False

    # Open the download before creating the file: a failed request must not
    # leave an empty mbox behind that later runs would skip as "downloaded".
    response = br.open(url)

    firstMessage = True
    with open(outFileName, 'w') as out:
        for line in response:
            if line.startswith("From "):
                # Separate messages with a blank line, except before the first.
                if firstMessage:
                    firstMessage = False
                else:
                    out.write("\n")
                # Restore the address that the archive shows as "user at host".
                line = line.replace(" at ", "@")
            elif line.startswith("Message-ID: "):
                # Boundary token: the id between '<' and '>' without '@' and '.'.
                messageId = line[line.find('<') + 1:line.rfind('>')]
                messageId = messageId.replace('@', '').replace('.', '')
                line = line + "Content-Type: multipart/mixed;boundary=_000_" + messageId + "_\n"
            out.write(line)
    return outFileName
def toMbox(lines):
    '''
    Convert the lines of a Mailman text archive into mbox lines.

    lines -- iterable of text lines (each normally ending in a newline)

    Returns a new list; the input is not modified. Joining the list gives the
    mbox text:
      * a line starting with "From " has " at " replaced by "@"; every such
        line except the first is preceded by a separate "\\n" element
      * a line starting with "Message-ID: " gets a generated
        "Content-Type: multipart/mixed" header line appended to the same
        element, with a boundary built from the message id
      * all other lines are passed through unchanged
    '''
    converted = []
    firstMessage = True
    for line in lines:
        if line.startswith("From "):
            # Blank separator between messages, but not before the first one.
            if firstMessage:
                firstMessage = False
            else:
                converted.append("\n")
            # Restore the address that the archive shows as "user at host".
            line = line.replace(" at ", "@")
        elif line.startswith("Message-ID: "):
            # Boundary token: the id between '<' and '>' without '@' and '.'.
            messageId = line[line.find('<') + 1:line.rfind('>')]
            messageId = messageId.replace('@', '').replace('.', '')
            line = line + "Content-Type: multipart/mixed;boundary=_000_" + messageId + "_\n"
        converted.append(line)
    return converted
if __name__ == '__main__':
    # Command line entry point: ./downloadArchives.py [dir] [-f]
    # Logs in, lists every archive and downloads each one into the chosen
    # directory (the current directory when none is given).
    import sys

    cliArgs = sys.argv[1:]
    # -f is accepted before or after the directory; anything else is taken
    # as the output directory.
    positional = [arg for arg in cliArgs if arg.lower() != '-f']
    flagCount = len(cliArgs) - len(positional)

    # At most one directory and at most one -f are allowed; otherwise show
    # the usage text (the module docstring) and stop.
    if len(positional) > 1 or flagCount > 1:
        print(__doc__)
        sys.exit()

    OVERWRITE = flagCount == 1
    if positional:
        rootDir = positional[0]
    else:
        rootDir = '.'

    authenticate()
    for url in getArchiveUrls():
        print(url)
        downloadAndDecodeArchive(url, rootDir=rootDir, overWrite=OVERWRITE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment