Created
February 16, 2012 02:53
-
-
Save corydolphin/1841235 to your computer and use it in GitHub Desktop.
Parses a Mailman Archive site and decodes all of the gzipped text archives in mbox format for reading with most popular email services, or mailbox.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
downloadArchives.py: Parses a Mailman Archive site and decodes all of the
gzipped text archives into mbox format for reading with most popular
email services, or with mailbox.py

Usage: ./downloadArchives.py [dir] [-f]

Where dir is the directory the mbox files are written to.
If dir is not specified, the output will default to the current working directory.
Optionally, the -f flag will overwrite any existing files of the same name as the archives.
""" | |
import mechanize | |
import cookielib | |
import os | |
from BeautifulSoup import BeautifulSoup | |
# One shared mechanize browser carries every request made by this script.
br = mechanize.Browser()

# Cookies received by the browser are kept in this jar; authenticate()
# relies on that to stay logged in.
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Handler switches: everything enabled except robots.txt processing.
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)

# Honour an immediate (refresh 0) page refresh, but never wait on a
# refresh whose delay is longer than max_time.
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

# Send a desktop Firefox User-agent instead of mechanize's default.
br.addheaders = [
    ('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'),
]
def authenticate(username='', password='',
                 loginUrl='https://lists.olin.edu/mailman/private/helpme/'):
    '''
    Authenticate with Mailman so that it gives the shared browser a cookie;
    the later archive requests are made with that same browser.

    username -- value submitted in the login form's 'username' field
                (the subscriber e-mail address)
    password -- value submitted in the login form's 'password' field
    loginUrl -- page that serves the login form; defaults to the private
                archive index of the list

    The defaults are blank, exactly as the previously hard-coded values
    were, so real credentials have to be passed in (or filled in here).
    Returns nothing.
    '''
    br.open(loginUrl)
    # Assumes the login form is the first form on the page.
    br.select_form(nr=0)
    br.form['username'] = username
    br.form['password'] = password
    br.submit()
def getArchiveUrls(baseUrl='https://lists.olin.edu/mailman/private/helpme/'):
    '''
    Returns a generator of absolute urls for each of the archives.

    baseUrl -- address of the archive index page. It must end with '/',
               because each relative link found in the table is appended
               to it as-is.

    Every table row is inspected; the link inside the third cell is taken
    as the downloadable archive. Rows that have no such link are reported
    on stdout and skipped.
    '''
    html = br.open(baseUrl).read()
    soup = BeautifulSoup(html)
    for row in soup.findAll('tr'):
        try:
            archiveSource = row.findAll('td')[2].find('a')['href']  # relative URL
        except (IndexError, TypeError, KeyError) as inst:
            # IndexError: fewer than three cells; TypeError: no <a> in the
            # cell (find() returned None); KeyError: <a> without an href.
            print(inst)
            continue
        # The yield sits outside the try so only the lookup is guarded.
        yield '%s%s' % (baseUrl, archiveSource)  # absolute URL
def downloadAndDecodeArchive(url, rootDir='', overWrite=False):
    '''
    Download one archive and save it as an mbox file.

    url       -- absolute URL of the archive; the text after the last '/'
                 is used as the file name, with 'txt.gz' replaced by 'mbox'
    rootDir   -- directory to write into; an empty string means the
                 current working directory. A missing directory is created.
    overWrite -- when False (the default) an existing output file is left
                 alone and nothing is downloaded

    Returns the path of the written file, or False when the download was
    skipped because the file already exists.
    '''
    archiveName = url[url.rfind('/') + 1:]
    outFileName = archiveName.replace('txt.gz', 'mbox')

    if rootDir:
        if not os.path.exists(rootDir):
            try:
                print('The directory you specified does not exist, attempting to create it')
                os.makedirs(rootDir)
            except OSError as inst:
                # Report and carry on; opening the output file below will
                # raise if the directory really is unusable.
                print(inst)
        outFileName = os.path.join(rootDir, outFileName)

    if not overWrite and os.path.exists(outFileName):
        # do not overwrite if already downloaded, by default
        return False

    # Open the URL before creating the file: if the request fails, no empty
    # mbox is left behind to be mistaken for a finished download next run.
    response = br.open(url)
    with open(outFileName, 'w') as out:
        for line in _convertArchiveLines(response):
            out.write(line)
    return outFileName


def _convertArchiveLines(lines):
    '''
    Yield the lines of a Mailman text archive adjusted for mbox readers.

    - Every line starting with "From " has " at " replaced by "@", and
      every such line except the first is preceded by a blank line.
    - Every line starting with "Message-ID: " gets a Content-Type header
      line appended, whose boundary is built from the message id with
      '@' and '.' removed.
    - All other lines pass through unchanged.
    '''
    firstMessage = True
    for line in lines:
        if line.startswith("From "):
            if firstMessage:
                firstMessage = False
            else:
                yield "\n"
            line = line.replace(" at ", "@")
        elif line.startswith("Message-ID: "):
            messageid_stripped = line[line.find('<') + 1:line.rfind('>')]
            messageid_stripped = messageid_stripped.replace('@', '').replace('.', '')
            line = line + "Content-Type: multipart/mixed;boundary=_000_" + messageid_stripped + "_\n"
        yield line
def toMbox(lines):
    '''
    Convert the lines of a Mailman text archive into mbox lines.

    lines -- iterable of strings, one per archive line (newline included)

    Returns a new list of strings; the input is not modified.
    - Every line starting with "From " has " at " replaced by "@", and
      every such line except the first is preceded by a separate "\\n"
      element so messages are separated by a blank line.
    - Every line starting with "Message-ID: " gets a Content-Type header
      line appended, whose boundary is built from the message id with
      '@' and '.' removed.
    - All other lines are copied unchanged.
    '''
    converted = []
    start = True
    for line in lines:
        if line.startswith("From "):
            if start:
                start = False
            else:
                converted.append("\n")
            line = line.replace(" at ", "@")
        elif line.startswith("Message-ID: "):
            messageid_stripped = line[line.find('<') + 1:line.rfind('>')]
            messageid_stripped = messageid_stripped.replace('@', '').replace('.', '')
            line = line + "Content-Type: multipart/mixed;boundary=_000_" + messageid_stripped + "_\n"
        converted.append(line)
    return converted
if __name__ == '__main__':
    # Command line: [dir] [-f]. At most one directory and one -f flag,
    # accepted in either order.
    import sys

    args = sys.argv[1:]
    flags = [arg for arg in args if arg.lower() == '-f']
    directories = [arg for arg in args if arg.lower() != '-f']

    if len(args) > 2 or len(directories) > 1:
        # Wrong usage: show the module docstring and signal failure.
        print(__doc__)
        sys.exit(1)

    # A lone -f is a flag, not a directory name.
    OVERWRITE = bool(flags)
    if directories:
        rootDir = directories[0]
    else:
        rootDir = '.'

    authenticate()
    for url in getArchiveUrls():
        print(url)
        downloadAndDecodeArchive(url, rootDir=rootDir, overWrite=OVERWRITE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment