# Source: GitHub gist yeiichi/d3b34fe376f7532cc79e76327f2ea42a by @yeiichi, created October 27, 2020 05:52.
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin
from pathlib import Path
from pprint import pprint
import time
# User agent definition:
# Maps a browser name to the User-Agent string sent in the `user-agent`
# request header. You can check your own User Agent at ifconfig.me
# Each value is built with implicit string concatenation so that no stray
# backslash or indentation whitespace can leak into the header value.
UA_LIST = {
    'SAFARI': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/'
        '605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15'
    ),
    'FIREFOX': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:76.0) Gecko/'
        '20100101 Firefox/76.0'
    ),
    'CHROME': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/'
        '537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
    ),
}
def fetch_url_pdf(target_url, user_agent='SAFARI'):
    """Collect the PDF links found on a web page.

    Downloads ``target_url``, parses the HTML with BeautifulSoup (``lxml``
    parser) and keeps every ``<a>`` element whose href path ends in ``.pdf``.

    Args:
        target_url: URL of the page to scan. Relative hrefs are resolved
            against it with ``urljoin``.
        user_agent: Key into ``UA_LIST`` (case-insensitive) selecting the
            ``user-agent`` header to send.

    Returns:
        A list of ``[link_text, absolute_pdf_url]`` pairs in document order.
        The list is empty when the page could not be downloaded.

    Raises:
        KeyError: If ``user_agent`` is not a key of ``UA_LIST``.
    """
    my_headers = {'user-agent': UA_LIST[user_agent.upper()]}
    # Download and parse
    try:
        res = requests.get(target_url, headers=my_headers, timeout=(3.05, 27))
    except requests.RequestException as exc:
        # Without a response there is nothing to parse: report the failure
        # and return an empty result instead of failing later on an
        # unbound ``res``.
        print(f'Error: {exc}')
        return []
    # Let requests guess the encoding from the body; the declared charset
    # is not always reliable.
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    url_array = []
    # Select a elements containing URL
    for anchor in soup.select('a'):
        pdf_title, pdf_url = anchor.text, anchor.get('href')
        if pdf_url is None:
            continue  # <a> without an href cannot point at a PDF
        # Filter for PDF: look only at the path part (drop any #fragment and
        # ?query) and compare case-insensitively so that links such as
        # "doc.PDF" or "doc.pdf?download=1" are recognised as well.
        path_part = str(pdf_url).split('#', 1)[0].split('?', 1)[0]
        if Path(path_part).suffix.lower() == '.pdf':
            url_array.append([pdf_title, urljoin(target_url, pdf_url)])
    return url_array
def _pdf_file_name(link_text, fallback):
    """Return a ``*.pdf`` file name derived from a link text that is safe to
    join onto the download directory."""
    # Link text often spans several lines; collapse every whitespace run.
    stem = ' '.join(str(link_text).split())
    # Path separators (and NUL) must never reach open(): they would make the
    # file land outside dl_dir or make the open call fail.
    for forbidden in ('/', '\\', '\0'):
        stem = stem.replace(forbidden, '_')
    # A name made only of dots/spaces ('', '.', '..') is not a usable file.
    stem = stem.strip('. ')
    return (stem or fallback) + '.pdf'


def pdf_downloader(url_array, dl_dir, user_agent='SAFARI'):
    """Download every PDF listed in ``url_array`` into ``dl_dir``.

    Each file is saved as ``<link text>.pdf`` (the text is sanitised first so
    that it is a valid, directory-local file name). Progress is printed per
    file; a failed download is reported and skipped. The function pauses
    five seconds between consecutive requests.

    Args:
        url_array: Sequence of ``[name, url]`` entries, as returned by
            ``fetch_url_pdf``.
        dl_dir: Directory the files are written to; created if missing.
        user_agent: Key into ``UA_LIST`` (case-insensitive) selecting the
            ``user-agent`` header to send.

    Raises:
        KeyError: If ``user_agent`` is not a key of ``UA_LIST``.
        OSError: If ``dl_dir`` cannot be created.
    """
    tfn = len(url_array)  # Total pdf File Number
    dig = len(str(tfn))  # Number of digits of tfn
    my_headers = {'user-agent': UA_LIST[user_agent.upper()]}
    # Fail early rather than after the first (slow) download attempt.
    os.makedirs(dl_dir, exist_ok=True)
    # Download
    for i, entry in enumerate(url_array):
        base_name, file_url = entry[0], entry[1]
        counter = str(i + 1).zfill(dig)
        file_name = _pdf_file_name(base_name, fallback=f'file_{counter}')
        try:
            res = requests.get(file_url, headers=my_headers,
                               timeout=(3.05, 27))
            # Do not store an HTTP error page under a .pdf name.
            res.raise_for_status()
            with open(os.path.join(dl_dir, file_name), 'wb') as f:
                f.write(res.content)
            print(counter + '/' + str(tfn).zfill(dig),
                  base_name, 'URL: ', file_url)
        except (requests.RequestException, OSError) as exc:
            # Best effort: report the failure and carry on with the rest.
            print(f'Error: {exc}')
        # Pause between requests; pointless after the last one.
        if i + 1 < tfn:
            print('Waiting...')
            time.sleep(5)
    print('DONE!')
if __name__ == '__main__':
    # Script entry point: ask for the page to scan, then pretty-print the
    # [link text, PDF URL] pairs that fetch_url_pdf finds on it.
    page_to_scan = input('Target URL? >> ')
    found_links = fetch_url_pdf(page_to_scan)
    pprint(found_links)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment