Skip to content

Instantly share code, notes, and snippets.

@kizernis
Last active May 22, 2019 00:02
Show Gist options
  • Save kizernis/47256b3abc42663719d11ae97cbc75c6 to your computer and use it in GitHub Desktop.
# Download over half a million HTML pages
# This script can be run multiple times simultaneously to increase download speed
import requests
import os
import portalocker
from bs4 import BeautifulSoup
import re
# import time
def is_locked(file_path):
    """Return True if another process holds an exclusive lock on file_path.

    The file is opened normally; under portalocker's LOCK_EX on Windows a
    read attempt raises PermissionError, which we treat as "locked".
    A missing file raises FileNotFoundError (same as the original open).
    """
    with open(file_path) as f:
        try:
            f.read(1)
        except PermissionError:
            # Another running instance of this script owns the file.
            return True
    return False
# Download each personal-record detail page into output/NNNNNN.html.
# Safe to run several instances concurrently: each instance grabs an
# exclusive lock on the file it is filling, and skips files that are
# already complete or locked by a sibling process.
session = requests.Session()
session.headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate'}

# Responses smaller than this are treated as failed/partial downloads.
min_file_size = 2000
# A complete detail page renders its data in <span id="lblXXX"> elements.
label_pattern = re.compile('^lbl')

with open('../2_extract_personal_urls/output.txt') as f:
    ids = f.read().splitlines()

# `page_id` instead of `id` — avoid shadowing the builtin.
for i, page_id in enumerate(ids, start=1):
    url = f'http://www.cadutigrandeguerra.it/DettagliNominativi.aspx?id={page_id}'
    page_file_path = f'output/{i:06d}.html'
    # Re-download when the file is missing, or when a previous attempt left
    # an undersized file that no concurrent instance is currently filling.
    # (Parentheses make the original and/or precedence explicit.)
    if not os.path.isfile(page_file_path) or (
            os.stat(page_file_path).st_size < min_file_size
            and not is_locked(page_file_path)):
        with open(page_file_path, 'w', encoding='utf-8') as f:
            # Exclusive lock tells sibling instances to skip this file.
            portalocker.lock(f, portalocker.LOCK_EX)
            # Retry until a complete page (exactly 18 data labels) arrives.
            while True:
                html = session.get(url).text
                if len(html) > min_file_size:
                    soup_rows = BeautifulSoup(html, 'lxml').find_all('span', id=label_pattern)
                    # find_all always returns a list, so only the count matters.
                    if len(soup_rows) == 18:
                        f.write(html)
                        # time.sleep(5)
                        break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment