Created May 16, 2015 20:26
""" | |
A simple command line utility that takes in a list of target url's | |
and downloads all pdf links contained on the page | |
Potentially look into using scrapy to build a full fledged crawler instead | |
""" | |
import requests | |
import sys | |
import re | |
import subprocess | |
from bs4 import BeautifulSoup | |
import urlparse | |
def process(target_url): | |
page = requests.get(target_url) | |
print "Succesfully crawled %s" %(target_url) | |
print "Searching for pdf links" | |
soup = BeautifulSoup(page.text) | |
#print soup | |
links = soup.findAll(href=re.compile("\.pdf$")) | |
for link in links: | |
#if pdf is locally linked | |
if not link.get('href').startswith("http"): | |
link = urlparse.urljoin(target_url,link.get('href')) | |
#if pdf link is absolute | |
else: | |
link = link.get('href') | |
print "Downloading: %s" %(link) | |
#Create pdf files | |
subprocess.call(["curl", "-O",link],shell=False) | |
print "Created: %s" %(link) | |
def main(): | |
if len(sys.argv) == 1: | |
print "Usage python get-all-pdf-on-page.py target_url_1 target_url_2 ..." | |
sys.exit(-1) | |
for target_url in sys.argv[1:]: | |
process(target_url) | |
if __name__ == "__main__": | |
main() | |
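For reference, a typical invocation looks like the line below, assuming the script is saved as get-all-pdf-on-page.py (the filename the usage message suggests); the URLs here are placeholders, not pages from the original gist. Each PDF found on those pages is written to the current working directory by curl, so curl must be available on the PATH.

    python get-all-pdf-on-page.py http://example.com/papers.html http://example.com/slides.html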
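The docstring floats scrapy as a future direction. Below is a minimal sketch of what that might look like with a recent Scrapy release; the spider name, class name, and start URL are hypothetical and not part of the original gist.

import scrapy


class PdfSpider(scrapy.Spider):
    # Hypothetical spider; run with: scrapy runspider pdf_spider.py
    name = "pdf_spider"
    start_urls = ["http://example.com/papers.html"]  # placeholder target page

    def parse(self, response):
        # Follow every link whose href ends in .pdf
        for href in response.css('a::attr(href)').getall():
            if href.endswith(".pdf"):
                yield response.follow(href, callback=self.save_pdf)

    def save_pdf(self, response):
        # Save the raw response body under the URL's final path segment
        filename = response.url.split("/")[-1]
        with open(filename, "wb") as f:
            f.write(response.body)

Compared with the one-shot requests + curl approach above, Scrapy would handle request scheduling, retries, and politeness settings for you, which is the main reason the docstring mentions it.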