Downloads every link on a given webpage that looks like a PDF, judged by a .pdf file extension in the URL.
#!/usr/bin/env python
"""
Download all the pdfs linked on a given webpage

Usage -
    python grab_pdfs.py url <path/to/directory>

url is required
path is optional. Path needs to be absolute
will save in the current directory if no path is given
will save in the current directory if given path does not exist

Requires - requests >= 1.0.4
           beautifulsoup4 >= 4.0.0
           lxml (the parser handed to BeautifulSoup below)

Download and install using
    pip install requests beautifulsoup4 lxml
"""
__author__ = 'elssar <[email protected]>'
__license__ = 'MIT'
__version__ = '1.0.0'
from os import path, getcwd
from sys import argv, exit

from bs4 import BeautifulSoup as soup
from requests import get
from urllib.parse import urljoin


def get_page(base_url):
    # Fetch the page; fail loudly on any non-200 response
    req = get(base_url)
    if req.status_code == 200:
        return req.text
    raise Exception('Error {0}'.format(req.status_code))


def get_all_links(html):
    # Collect every anchor tag on the page
    bs = soup(html, 'lxml')
    return bs.find_all('a')


def get_pdf(base_url, base_dir):
    html = get_page(base_url)
    links = get_all_links(html)
    if len(links) == 0:
        raise Exception('No links found on the webpage')
    n_pdfs = 0
    for link in links:
        # Anchors without an href would raise a KeyError with link['href']
        href = link.get('href', '')
        if href.lower().endswith('.pdf'):
            content = get(urljoin(base_url, href))
            # Only count and save responses the server actually serves as a PDF;
            # the content-type header may carry extra parameters, hence startswith
            if content.status_code == 200 and content.headers.get('content-type', '').startswith('application/pdf'):
                n_pdfs += 1
                # Name the file after the last path segment of the URL
                filename = path.split(urljoin(base_url, href))[1]
                with open(path.join(base_dir, filename), 'wb') as pdf:
                    pdf.write(content.content)
    if n_pdfs == 0:
        raise Exception('No pdfs found on the page')
    print('{} pdfs downloaded and saved in {}'.format(n_pdfs, base_dir))


if __name__ == '__main__':
    if len(argv) not in (2, 3):
        print('Error! Invalid arguments')
        print(__doc__)
        exit(-1)
    url = argv[1]
    arg = argv[2] if len(argv) == 3 else ''
    # Fall back to the current directory when no valid path is given
    base_dir = arg if path.isdir(arg) else getcwd()
    try:
        get_pdf(url, base_dir)
    except Exception as e:
        print(e)
        exit(-1)
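
A quick usage sketch; the URL and target directory here are placeholders, and the directory must already exist for it to be used rather than the current directory:

    python grab_pdfs.py https://example.com/reports /home/user/pdfs

The downloader can also be driven from another script by importing get_pdf, assuming this file is saved as grab_pdfs.py somewhere on the import path:

    from grab_pdfs import get_pdf

    get_pdf('https://example.com/reports', '/home/user/pdfs')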