Download All PDFs from a URL using Python mechanize
# Loosely based on: http://stackoverflow.com/questions/5974595/download-all-the-linksrelated-documents-on-a-webpage-using-python
# Note: this is Python 2 code (cookielib, print statements).
import cookielib
import mechanize
from time import sleep
import os
import cgi

# Download a file from a link by simulating a click on it
# (uses the global browser object `br` defined below)
def downloadlink(linkUrl, referer):
    r = br.click_link(linkUrl)
    r.add_header("Referer", referer)  # add a Referer header, just in case
    response = br.open(r)

    # get the filename from the response headers if possible
    cdheader = response.info().getheader('Content-Disposition')
    if cdheader:
        value, params = cgi.parse_header(cdheader)
        filename = params["filename"]
    else:
        # otherwise fall back to the link's basename
        filename = os.path.basename(linkUrl.url)

    # write the response content to disk; "wb" because PDFs are binary
    # TODO: perhaps ensure that the file doesn't already exist?
    with open(filename, "wb") as f:
        f.write(response.read())
    print filename, "has been downloaded"
    br.back()

# Make a Browser (think of this as Chrome or Firefox, etc.)
br = mechanize.Browser()

# Enable cookie support
cookiejar = cookielib.LWPCookieJar()
br.set_cookiejar(cookiejar)

# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]  # masquerade as a real browser; this is not nice to do, though

# Open your site
mypageUrl = 'http://my.url.com/page'
br.open(mypageUrl)

print "Get all PDF links\n"

filetypes = ["pdf", "PDF"]  # pattern matching for links; add more kinds here
myfiles = []
for l in br.links():
    # keep the link if its URL or text contains one of the patterns
    if any(t in l.url or t in l.text for t in filetypes):
        myfiles.append(l)

# To download only the first 100 links, replace the loop below with:
# for index, l in zip(range(100), myfiles):
for l in myfiles:
    # sleep(1)  # uncomment to throttle downloads, so you don't hammer the site
    downloadlink(l, mypageUrl)
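
The script above targets Python 2. As a rough modern equivalent, here is a minimal Python 3 sketch of the same idea using requests and BeautifulSoup instead of mechanize. It is an assumption-laden port, not part of the original gist: PAGE_URL is the same placeholder as above, only links whose href ends in ".pdf" are matched, and the Content-Disposition handling is dropped in favor of the URL-basename fallback.

# Minimal Python 3 sketch (assumed port; requests + BeautifulSoup, not mechanize)
import posixpath
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

PAGE_URL = "http://my.url.com/page"  # placeholder, as in the gist above

session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0"  # masquerade; same caveat as above

page = session.get(PAGE_URL)
page.raise_for_status()
soup = BeautifulSoup(page.text, "html.parser")

for a in soup.find_all("a", href=True):
    href = a["href"]
    if not href.lower().endswith(".pdf"):
        continue
    pdf_url = urljoin(PAGE_URL, href)  # resolve relative links against the page
    resp = session.get(pdf_url, headers={"Referer": PAGE_URL})
    resp.raise_for_status()
    # fall back to the URL's basename for the filename, as the gist does
    filename = posixpath.basename(urlparse(pdf_url).path) or "download.pdf"
    with open(filename, "wb") as f:
        f.write(resp.content)
    print(filename, "has been downloaded")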