Scraping PDF files: downloads every PDF, DOC, DOCX, and EPUB file linked from a given web page into a local directory.
""" | |
Written with a lot of help from StackOverflow community | |
and Python API documentation -- greatly appreciated! | |
""" | |
import sys | |
import requests | |
import urllib3 | |
import concurrent.futures | |
from bs4 import BeautifulSoup | |
from datetime import datetime as DT | |
from pathlib import Path | |
TPE = concurrent.futures.ThreadPoolExecutor(max_workers=6) | |
# Disable the SSL warnings | |
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) | |
FIXED_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:74.0) Gecko/20100101 Firefox/74.0" | |
DEBUG = False | |
HTTP_TIMEOUT_SEC = 5 | |
HTTP_DELAY_SEC = 2 | |
def log(msg): | |
ts = DT.now().strftime("%Y-%m-%d@%I:%M:%S%p") | |
print("[{0}] : {1}".format(ts, msg)) | |
def debug(msg): | |
if DEBUG: | |
log(msg) | |
def make_http_request(url): | |
log("Requesting URL {0}. Delay {1}s".format(url, HTTP_DELAY_SEC)) | |
return requests.get(url, | |
headers={'User-Agent': FIXED_UA}, | |
timeout=HTTP_TIMEOUT_SEC) | |
def download_file(url, file_path): | |
if file_path.exists(): | |
print("*** File {} already exists. Skipping.".format(file_path)) | |
return | |
# open in binary mode | |
with open(file_path, "wb") as file: | |
# get request | |
response = make_http_request(url) | |
# write to file | |
file.write(response.content) | |
print("Downloaded file {0} from {1}".format(url, file_path)) | |
def make_page_soup(page_url): | |
page = make_http_request(page_url) | |
if page.status_code == requests.codes.ok: | |
return BeautifulSoup(page.content, 'lxml') | |
else: | |
log("Failed to get page at URL {0}. Error: {1}".format( | |
page_url, page.reason)) | |
def main(pg_url, out_dir): | |
try: | |
file_exts = (".pdf", ".doc", ".epub", ".docx") | |
soup = make_page_soup(pg_url) | |
for link in soup.find_all('a'): | |
url = str(link.get('href')) | |
name = link.text.strip().replace(" ", "_") | |
fp = Path(out_dir, name) | |
if url.lower().endswith(file_exts): | |
TPE.submit(download_file, url, fp) | |
except Exception as ex: | |
log("**** Error "+str(ex)) | |
if __name__ == "__main__": | |
if len(sys.argv) != 3: | |
print("Usage: {0} INPUT_URL OUTPUT_DIR_PATH".format( | |
sys.argv[0])) | |
else: | |
main(sys.argv[1], sys.argv[2]) |
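For reference, a hypothetical invocation (the script name scrape_files.py and the URL are placeholders; the output directory must already exist, since the script does not create it):

python scrape_files.py "https://example.com/library/" ./downloads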