Skip to content

Instantly share code, notes, and snippets.

@subpath
Created April 1, 2019 04:35
Show Gist options
  • Select an option

  • Save subpath/8d67ec23626c930dc88079f5dcbc32f7 to your computer and use it in GitHub Desktop.

Select an option

Save subpath/8d67ec23626c930dc88079f5dcbc32f7 to your computer and use it in GitHub Desktop.
"""Save pdf files from lab website."""
import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
from tqdm import tqdm

# Connect to the webpage. The timeout keeps the script from hanging forever on
# a stalled connection, and raise_for_status() stops early on a 4xx/5xx index
# page instead of parsing an error page and reporting "0 of 0 files".
url = 'https://forwardgro.com/testresults/'
resp = requests.get(url, timeout=30)
resp.raise_for_status()

# Prefer the encoding declared inside the HTML itself; fall back to the HTTP
# header only when it explicitly names a charset.
content_type = resp.headers.get('content-type', '').lower()
http_encoding = resp.encoding if 'charset' in content_type else None
html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
encoding = html_encoding or http_encoding
soup = BeautifulSoup(resp.content, from_encoding=encoding, features="html.parser")

# Extract all links from the webpage, resolving relative hrefs against the
# page URL so that every entry can be fetched directly.
links = [urljoin(url, anchor['href']) for anchor in soup.find_all('a', href=True)]

# Keep only links to pdf files. The check is made on the URL path (not the raw
# href) so that "report.PDF" and "report.pdf?v=2" are recognised as well.
# dict.fromkeys() drops repeated links while preserving page order.
links = list(dict.fromkeys(
    link for link in links
    if urlparse(link).path.lower().endswith('.pdf')
))

# Save pdf files locally into the pdf folder, creating it when it is missing.
os.makedirs('pdf', exist_ok=True)
count = 0
for link in tqdm(links, desc='Downloading'):
    try:
        response = requests.get(link, timeout=30)
    except requests.RequestException as exc:
        # Best effort: one unreachable file must not abort the whole batch.
        # tqdm.write() prints the message without corrupting the progress bar.
        tqdm.write('Failed to download {}: {}'.format(link, exc))
        continue
    if response.status_code != 200:
        tqdm.write('Skipped {} (HTTP {})'.format(link, response.status_code))
        continue
    # Name the file after the last path segment, ignoring any query string.
    filename = os.path.basename(urlparse(link).path)
    with open(os.path.join('pdf', filename), 'wb') as f:
        f.write(response.content)
    # Count a file only once it has actually been written to disk.
    count += 1
print('Downloaded {} of {} files'.format(count, len(links)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment