subpath · April 1, 2019 04:35
diff --git a/data_web_scraping.py b/data_web_scraping.py
 """Save pdf files from lab website."""

 import os
 from urllib.parse import urljoin

 from bs4 import BeautifulSoup
 from bs4.dammit import EncodingDetector
 import requests
 from tqdm import tqdm

 # Seconds to wait on any single HTTP request; without a timeout a stalled
 # server would hang the script indefinitely.
 REQUEST_TIMEOUT = 30
 # Local folder the pdf files are written into.
 OUTPUT_DIR = 'pdf'

 # connect to the webpage
 url = 'https://forwardgro.com/testresults/'
 resp = requests.get(url, timeout=REQUEST_TIMEOUT)
 # Stop here on an HTTP error instead of scraping the error page for links.
 resp.raise_for_status()

 # Prefer the encoding declared inside the HTML; fall back to the HTTP header
 # only when that header explicitly carries a charset.
 content_type = resp.headers.get('content-type', '').lower()
 http_encoding = resp.encoding if 'charset' in content_type else None
 html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
 encoding = html_encoding or http_encoding
 soup = BeautifulSoup(resp.content, from_encoding=encoding, features="html.parser")

 # extract all links from the webpage
 links = soup.find_all('a', href=True)
 # Keep only links to pdf files (extension matched case-insensitively) and
 # resolve relative hrefs against the page url so requests can fetch them.
 links = [
     urljoin(url, link['href'])
     for link in links
     if link['href'].lower().endswith('.pdf')
 ]

 # save pdf files locally into the pdf folder, creating it on first run
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 count = 0
 for link in tqdm(links, desc='Downloading'):
     try:
         response = requests.get(link, timeout=REQUEST_TIMEOUT)
     except requests.RequestException as exc:
         # One unreachable file must not abort the remaining downloads.
         # tqdm.write prints without corrupting the progress bar.
         tqdm.write('Skipping {}: {}'.format(link, exc))
         continue
     if response.status_code != 200:
         tqdm.write('Skipping {}: HTTP {}'.format(link, response.status_code))
         continue
     # The file name is the last path segment of the link.
     with open(os.path.join(OUTPUT_DIR, link.split('/')[-1]), 'wb') as f:
         f.write(response.content)
     # Count only after the file has actually been written.
     count += 1

 print('Downloaded {} of {} files'.format(count, len(links)))
	"""Save pdf files from lab website."""

	import os
	from urllib.parse import urljoin

	from bs4 import BeautifulSoup
	from bs4.dammit import EncodingDetector
	import requests
	from tqdm import tqdm

	# Seconds to wait on any single HTTP request; without a timeout a stalled
	# server would hang the script indefinitely.
	REQUEST_TIMEOUT = 30
	# Local folder the pdf files are written into.
	OUTPUT_DIR = 'pdf'

	# connect to the webpage
	url = 'https://forwardgro.com/testresults/'
	resp = requests.get(url, timeout=REQUEST_TIMEOUT)
	# Stop here on an HTTP error instead of scraping the error page for links.
	resp.raise_for_status()

	# Prefer the encoding declared inside the HTML; fall back to the HTTP header
	# only when that header explicitly carries a charset.
	content_type = resp.headers.get('content-type', '').lower()
	http_encoding = resp.encoding if 'charset' in content_type else None
	html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
	encoding = html_encoding or http_encoding
	soup = BeautifulSoup(resp.content, from_encoding=encoding, features="html.parser")

	# extract all links from the webpage
	links = soup.find_all('a', href=True)
	# Keep only links to pdf files (extension matched case-insensitively) and
	# resolve relative hrefs against the page url so requests can fetch them.
	links = [
	    urljoin(url, link['href'])
	    for link in links
	    if link['href'].lower().endswith('.pdf')
	]

	# save pdf files locally into the pdf folder, creating it on first run
	os.makedirs(OUTPUT_DIR, exist_ok=True)
	count = 0
	for link in tqdm(links, desc='Downloading'):
	    try:
	        response = requests.get(link, timeout=REQUEST_TIMEOUT)
	    except requests.RequestException as exc:
	        # One unreachable file must not abort the remaining downloads.
	        # tqdm.write prints without corrupting the progress bar.
	        tqdm.write('Skipping {}: {}'.format(link, exc))
	        continue
	    if response.status_code != 200:
	        tqdm.write('Skipping {}: HTTP {}'.format(link, response.status_code))
	        continue
	    # The file name is the last path segment of the link.
	    with open(os.path.join(OUTPUT_DIR, link.split('/')[-1]), 'wb') as f:
	        f.write(response.content)
	    # Count only after the file has actually been written.
	    count += 1

	print('Downloaded {} of {} files'.format(count, len(links)))
No results found