Created
October 9, 2012 17:10
-
-
Save soeirosantos/3860096 to your computer and use it in GitHub Desktop.
Simple script for downloading PDFs from a specific page
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Simple script for downloading PDFs from a specific page.
Depends on BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
""" | |
import urllib2 | |
import urllib | |
from bs4 import BeautifulSoup | |
# Base URL of the location that hosts the PDFs.
path = 'http://some/interesting/place/with/pdfs'
# Specific page under ``path`` whose links are scanned.
page = 'somepage.html'
page_path = path + "/" + page

# Fetch the page and always close the HTTP response, even if reading fails.
response = urllib2.urlopen(page_path)
try:
    html = response.read()
finally:
    response.close()

# Every <a> element on the page; filtered for PDF links further below.
all_links = BeautifulSoup(html).find_all("a")
def getFileName(href):
    """Return the part of ``href`` after its last "/".

    If ``href`` contains no "/", the whole string is returned; if it ends
    with "/", the result is an empty string.
    """
    _head, _sep, tail = href.rpartition("/")
    return tail
def completeUrl(href, path):
    """Return a full URL for ``href``.

    An ``href`` that already starts with an ``http://`` or ``https://``
    scheme (case-insensitive) is returned unchanged. Anything else is
    treated as relative and appended to ``path`` with a "/" separator.

    Note: the join is a plain string concatenation, so root-relative
    (``/x.pdf``) or parent-relative (``../x.pdf``) hrefs are not resolved.
    """
    # Check the prefix rather than a substring: an href such as
    # "get?u=http://..." is relative, and "https://..." is absolute.
    if href.lower().startswith(("http://", "https://")):
        return href
    return path + "/" + href
# Download every linked PDF, saving each one under its own file name
# (a relative name, so it lands in the current working directory).
for link in all_links:
    href = link.get("href")
    # Anchors without an href attribute yield None; skip those and any
    # link that does not point at a PDF.
    if not href or ".pdf" not in href:
        continue
    url = completeUrl(href, path)
    file_name = getFileName(href)
    print("downloading %s -> %s" % (url, file_name))
    urllib.urlretrieve(url, file_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment