kizernis · May 22, 2019 00:02
diff --git a/mass_download.py b/mass_download.py
 # Download over half a million HTML pages
 # This script can be run multiple times simultaneously to increase download speed

 import requests
 import os
 import portalocker
 from bs4 import BeautifulSoup
 import re
 # import time

 def is_locked(file_path):
    """Report whether a read of ``file_path`` is currently being refused.

    The file is opened in binary mode and a single byte is requested. A
    ``PermissionError`` raised by that read is taken to mean that another
    running instance of this script holds an exclusive lock on the file.

    Binary mode is deliberate: the probe only cares whether the read is
    permitted, so it must not fail with ``UnicodeDecodeError`` when the
    platform's default text encoding cannot decode the saved page.

    NOTE(review): presumably this only detects locks on platforms where
    file locks are mandatory (e.g. Windows); with advisory locks the read
    succeeds regardless -- confirm on the target platform.

    Args:
        file_path: Path of an existing file to probe.

    Returns:
        True if the one-byte read raised ``PermissionError``; otherwise
        False (including for an empty file).

    Raises:
        OSError: If the file cannot be opened at all (for example, it does
            not exist).
    """
    with open(file_path, 'rb') as f:
        try:
            f.read(1)
        except PermissionError:
            return True
    return False

 import time  # local to this script section: used to pause between network retries

 # One shared HTTP session for all downloads; these headers are sent with
 # every request made through it.
 session = requests.Session()
 session.headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate'}

 # Threshold used twice below: a saved file smaller than this many bytes is
 # downloaded again, and a response must be longer than this many characters
 # to be considered for saving.
 min_file_size = 2000
 # Matches the ids of the <span> elements that are counted to validate a page.
 label_pattern = re.compile(r'^lbl')
 # A response is only saved when it contains exactly this many matching spans.
 expected_label_count = 18
 # Seconds to wait for the server before a request is abandoned; without a
 # timeout a stalled connection would block this instance indefinitely.
 request_timeout = 30
 # Consecutive network errors tolerated for one page before the error is
 # re-raised, and the pause (in seconds) between those attempts.
 max_network_failures = 5
 retry_pause = 2

 # One id per line; the 1-based line number determines the output file name.
 with open('../2_extract_personal_urls/output.txt') as f:
    ids = f.read().splitlines()

 for i, page_id in enumerate(ids, start=1):
    url = f'http://www.cadutigrandeguerra.it/DettagliNominativi.aspx?id={page_id}'
    page_file_path = f'output/{i:06d}.html'

    # Skip pages that are already saved at full size, and undersized files
    # that another instance currently has locked.
    if os.path.isfile(page_file_path) and (
            os.stat(page_file_path).st_size >= min_file_size
            or is_locked(page_file_path)):
        continue

    # NOTE(review): the check above and this open are not atomic, and mode 'w'
    # truncates the file before the lock is acquired; two instances reaching
    # the same page at the same moment may both download it -- confirm that
    # this is acceptable.
    with open(page_file_path, 'w', encoding='utf-8') as f:
        portalocker.lock(f, portalocker.LOCK_EX)
        network_failures = 0
        while True:
            try:
                html = session.get(url, timeout=request_timeout).text
            except requests.RequestException:
                # Retry transient network errors a few times, then give up
                # loudly instead of looping forever.
                network_failures += 1
                if network_failures > max_network_failures:
                    raise
                time.sleep(retry_pause)
                continue
            network_failures = 0

            # Too-short responses are requested again immediately.
            if len(html) <= min_file_size:
                continue

            # find_all always returns a list-like result, so only its length
            # needs checking.
            soup_rows = BeautifulSoup(html, 'lxml').find_all('span', id=label_pattern)
            if len(soup_rows) == expected_label_count:
                f.write(html)
                # time.sleep(5)
                break
	# Download over half a million HTML pages
	# This script can be run multiple times simultaneously to increase download speed

	import requests
	import os
	import portalocker
	from bs4 import BeautifulSoup
	import re
	# import time

	def is_locked(file_path):
	    """Report whether a read of ``file_path`` is currently being refused.

	    The file is opened in binary mode and a single byte is requested. A
	    ``PermissionError`` raised by that read is taken to mean that another
	    running instance of this script holds an exclusive lock on the file.

	    Binary mode is deliberate: the probe only cares whether the read is
	    permitted, so it must not fail with ``UnicodeDecodeError`` when the
	    platform's default text encoding cannot decode the saved page.

	    NOTE(review): presumably this only detects locks on platforms where
	    file locks are mandatory (e.g. Windows); with advisory locks the read
	    succeeds regardless -- confirm on the target platform.

	    Args:
	        file_path: Path of an existing file to probe.

	    Returns:
	        True if the one-byte read raised ``PermissionError``; otherwise
	        False (including for an empty file).

	    Raises:
	        OSError: If the file cannot be opened at all (for example, it does
	            not exist).
	    """
	    with open(file_path, 'rb') as f:
	        try:
	            f.read(1)
	        except PermissionError:
	            return True
	    return False

	import time  # local to this script section: used to pause between network retries

	# One shared HTTP session for all downloads; these headers are sent with
	# every request made through it.
	session = requests.Session()
	session.headers = {
	    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
	    'Accept-Encoding': 'gzip, deflate'}

	# Threshold used twice below: a saved file smaller than this many bytes is
	# downloaded again, and a response must be longer than this many characters
	# to be considered for saving.
	min_file_size = 2000
	# Matches the ids of the <span> elements that are counted to validate a page.
	label_pattern = re.compile(r'^lbl')
	# A response is only saved when it contains exactly this many matching spans.
	expected_label_count = 18
	# Seconds to wait for the server before a request is abandoned; without a
	# timeout a stalled connection would block this instance indefinitely.
	request_timeout = 30
	# Consecutive network errors tolerated for one page before the error is
	# re-raised, and the pause (in seconds) between those attempts.
	max_network_failures = 5
	retry_pause = 2

	# One id per line; the 1-based line number determines the output file name.
	with open('../2_extract_personal_urls/output.txt') as f:
	    ids = f.read().splitlines()

	for i, page_id in enumerate(ids, start=1):
	    url = f'http://www.cadutigrandeguerra.it/DettagliNominativi.aspx?id={page_id}'
	    page_file_path = f'output/{i:06d}.html'

	    # Skip pages that are already saved at full size, and undersized files
	    # that another instance currently has locked.
	    if os.path.isfile(page_file_path) and (
	            os.stat(page_file_path).st_size >= min_file_size
	            or is_locked(page_file_path)):
	        continue

	    # NOTE(review): the check above and this open are not atomic, and mode 'w'
	    # truncates the file before the lock is acquired; two instances reaching
	    # the same page at the same moment may both download it -- confirm that
	    # this is acceptable.
	    with open(page_file_path, 'w', encoding='utf-8') as f:
	        portalocker.lock(f, portalocker.LOCK_EX)
	        network_failures = 0
	        while True:
	            try:
	                html = session.get(url, timeout=request_timeout).text
	            except requests.RequestException:
	                # Retry transient network errors a few times, then give up
	                # loudly instead of looping forever.
	                network_failures += 1
	                if network_failures > max_network_failures:
	                    raise
	                time.sleep(retry_pause)
	                continue
	            network_failures = 0

	            # Too-short responses are requested again immediately.
	            if len(html) <= min_file_size:
	                continue

	            # find_all always returns a list-like result, so only its length
	            # needs checking.
	            soup_rows = BeautifulSoup(html, 'lxml').find_all('span', id=label_pattern)
	            if len(soup_rows) == expected_label_count:
	                f.write(html)
	                # time.sleep(5)
	                break