Last active
May 22, 2019 00:02
-
-
Save kizernis/47256b3abc42663719d11ae97cbc75c6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download over half a million HTML pages.
# This script can be run multiple times simultaneously to increase download speed.
import requests | |
import os | |
import portalocker | |
from bs4 import BeautifulSoup | |
import re | |
# import time | |
def is_locked(file_path):
    """Return True if reading *file_path* is refused with PermissionError.

    The download loop uses this to skip a page file that another running
    instance of the script currently holds locked.

    NOTE(review): a lock only surfaces as PermissionError on read where file
    locks are mandatory (presumably Windows) -- confirm the target platform.

    Args:
        file_path: Path of an existing file to probe.

    Returns:
        True when the one-byte probe read raises PermissionError, else False.

    Raises:
        OSError: If the file cannot be opened at all (e.g. it does not exist).
    """
    # Probe in binary mode: the pages are written as UTF-8, so a text-mode
    # read with the platform default encoding could raise UnicodeDecodeError
    # on perfectly readable content instead of answering the lock question.
    with open(file_path, 'rb') as f:
        try:
            f.read(1)
        except PermissionError:
            return True
    return False
# One HTTP session shared by every request made below.
session = requests.Session()
session.headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate'}

# A saved file smaller than this (bytes), or a response that is not longer
# than this (characters), is treated as incomplete and fetched again.
min_file_size = 2000
# Matches element ids that start with "lbl"; an accepted page has 18 such spans.
label_pattern = re.compile('^lbl')
# Seconds to wait on the server before giving up on one attempt. Without a
# timeout a stalled connection would block forever while holding the file lock.
request_timeout = 60

# One id per line; the 1-based line position becomes the output file number.
with open('../2_extract_personal_urls/output.txt') as f:
    ids = f.read().splitlines()

# Make sure the target directory exists before the first page is written.
os.makedirs('output', exist_ok=True)

for i, person_id in enumerate(ids, start=1):
    url = f'http://www.cadutigrandeguerra.it/DettagliNominativi.aspx?id={person_id}'
    page_file_path = f'output/{i:06d}.html'

    if os.path.isfile(page_file_path):
        # Already downloaded in full: nothing to do.
        if os.stat(page_file_path).st_size >= min_file_size:
            continue
        # Too small but locked: another running instance is working on it.
        if is_locked(page_file_path):
            continue

    with open(page_file_path, 'w', encoding='utf-8') as page_file:
        # Hold an exclusive lock while downloading so that other instances
        # skip this page; closing the file releases it.
        portalocker.lock(page_file, portalocker.LOCK_EX)
        # Retry until the response looks like a complete page. There is no
        # delay between attempts.
        while True:
            try:
                html = session.get(url, timeout=request_timeout).text
            except requests.exceptions.Timeout:
                # Only timeouts are retried; other request errors propagate
                # exactly as they did before the timeout was introduced.
                continue
            if len(html) <= min_file_size:
                continue
            soup_rows = BeautifulSoup(html, 'lxml').find_all('span', id=label_pattern)
            # find_all always returns a list, so only its length needs checking.
            if len(soup_rows) == 18:
                page_file.write(html)
                break
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment