Created
October 27, 2020 05:52
-
-
Save yeiichi/d3b34fe376f7532cc79e76327f2ea42a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import os | |
| from urllib.parse import urljoin | |
| from pathlib import Path | |
| from pprint import pprint | |
| import time | |
| # User agent definition: | |
| # You can check your User Agent at ifconfig.me | |
# User-Agent header values, keyed by an upper-case browser name.
# Each value is assembled with implicit string concatenation so that no
# backslash or source indentation can leak into the header that is sent.
UA_LIST = {
    'SAFARI': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/13.1 Safari/605.1.15'
    ),
    'FIREFOX': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:76.0) '
        'Gecko/20100101 Firefox/76.0'
    ),
    'CHROME': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.138 Safari/537.36'
    ),
}
def fetch_url_pdf(target_url, user_agent='SAFARI'):
    """Collect the PDF links found on a web page.

    Args:
        target_url: URL of the page to scan.
        user_agent: Key of UA_LIST (case-insensitive) selecting the
            User-Agent header sent with the request.

    Returns:
        A list of ``[link_text, absolute_url]`` pairs, one for every ``<a>``
        element whose href path ends in ``.pdf`` (case-insensitive). The
        list is empty when the page could not be downloaded.

    Raises:
        KeyError: If ``user_agent`` is not a key of UA_LIST.
    """
    from urllib.parse import urlparse

    my_headers = {'user-agent': UA_LIST[user_agent.upper()]}

    # Download. On failure report the error and return early: there is no
    # response to parse, and continuing would hit an unbound ``res``.
    try:
        res = requests.get(target_url, headers=my_headers, timeout=(3.05, 27))
    except requests.RequestException as exc:
        print(f'Error: {exc}')
        return []

    # Let requests guess the encoding from the body rather than trusting a
    # possibly missing or wrong charset header.
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')

    url_array = []
    for anchor in soup.select('a'):
        href = anchor.get('href')
        if not href:
            # <a> without an href (e.g. a named anchor) cannot be a PDF link.
            continue
        # Test only the path component so query strings and fragments
        # ("file.pdf?dl=1", "file.pdf#page=2") do not hide the extension.
        if Path(urlparse(href).path).suffix.lower() == '.pdf':
            url_array.append([anchor.text, urljoin(target_url, href)])
    return url_array
def _pdf_file_name(link_text, file_url):
    """Build a filesystem-safe file name ending in '.pdf' for one link."""
    import re
    from urllib.parse import urlparse

    # Link text often spans several lines in the HTML; flatten it first.
    name = ' '.join(str(link_text).split())
    if not name:
        # Empty or image-only links: fall back to the URL's last path segment.
        name = Path(urlparse(str(file_url)).path).name
    # Replace path separators and characters that are invalid on common
    # filesystems, so the file can never be written outside the target dir.
    name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', name).strip(' .')
    if not name:
        name = 'download'
    if not name.lower().endswith('.pdf'):
        name += '.pdf'
    return name


def pdf_downloader(url_array, dl_dir, user_agent='SAFARI'):
    """Download every PDF listed in ``url_array`` into ``dl_dir``.

    Args:
        url_array: Sequence of ``[link_text, url]`` entries, as returned by
            fetch_url_pdf. The link text is used to derive the file name.
        dl_dir: Directory the files are written to; created if missing.
        user_agent: Key of UA_LIST (case-insensitive) selecting the
            User-Agent header sent with each request.

    Returns:
        None. Progress and errors are printed; a failed download does not
        stop the remaining ones.

    Raises:
        KeyError: If ``user_agent`` is not a key of UA_LIST.
    """
    total = len(url_array)   # Total number of PDF files
    width = len(str(total))  # Digits needed to zero-pad the progress counter
    my_headers = {'user-agent': UA_LIST[user_agent.upper()]}

    if dl_dir:
        os.makedirs(dl_dir, exist_ok=True)

    for index, entry in enumerate(url_array, start=1):
        base_name, file_url = entry[0], entry[1]
        file_name = _pdf_file_name(base_name, file_url)
        try:
            res = requests.get(file_url, headers=my_headers,
                               timeout=(3.05, 27))
            # Do not save an HTML error page under a .pdf name.
            res.raise_for_status()
            with open(os.path.join(dl_dir, file_name), 'wb') as f:
                f.write(res.content)
            print(str(index).zfill(width) + '/' + str(total).zfill(width),
                  base_name, 'URL: ', file_url)
        except Exception as exc:
            # Best effort: report the failure and move on to the next file.
            print(f'Error: {exc}')
        if index < total:
            # Pause between requests to be polite to the server; there is
            # nothing to wait for after the final file.
            print('Waiting...')
            time.sleep(5)
    print('DONE!')
if __name__ == '__main__':
    # Script entry point: prompt for a page URL, then pretty-print the
    # [link_text, url] pairs of the PDF links found on that page.
    pprint(fetch_url_pdf(input('Target URL? >> ')))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment