Creates CSV files listing the PEC addresses of the Italian municipalities. From the main page it retrieves the URL of each region, from which it builds the corresponding CSV.
import os
import re
import sys
import logging
from pathlib import Path

import requests
from requests_html import HTML

logging.basicConfig()
logger = logging.getLogger()
logger.setLevel('INFO')

# index of the column containing the "mailto:" link with the PEC address
COLUMN_EMAIL = 3

URL_ROOT_DOMAIN = 'http://www.comuniverso.it/'
URL_ROOT_PAGE = 'index.cfm?Comuni_con_PEC&menu=271'
DOWNLOADS_FOLDER = 'downloads'
DIR = os.path.dirname(__file__)

get_download_path = lambda x: Path(DIR) / DOWNLOADS_FOLDER / x


def download_and_save(url, filename):
    '''Download url and save it as filename, unless it already exists locally.'''
    download_dir = Path(DIR) / DOWNLOADS_FOLDER
    download_path = download_dir / filename

    if not download_dir.exists():
        logger.info("creating directory '%s'" % download_dir)
        download_dir.mkdir(parents=True)

    if download_path.exists():
        return download_path

    logger.info("'%s' doesn't exist, I'll download it using '%s'" % (download_path, url))

    response = requests.get(URL_ROOT_DOMAIN + url)

    if response.status_code != 200:
        raise Exception("connection to '%s' failed" % url)

    with download_path.open("w") as f:
        f.write(response.text)

    return download_path


def usage(argname):
    print("usage: %s" % argname)
    sys.exit(1)


def csv_from_html(filepath):
    '''Extract one CSV line per municipality from a region's HTML page.'''
    ret = []

    with open(filepath) as f:
        contents = f.read()

    html = HTML(html=contents)

    table = html.xpath('/html/body/a/table')[0]
    tbody = table.find('tbody')[0]

    for row in tbody.find('tr'):
        line = []
        for idx, column in enumerate(row.find('td')):
            if idx == COLUMN_EMAIL:
                # the PEC address is the target of a "mailto:" link
                line.append(column.find('a')[0].attrs['href'][len('mailto:'):])
            else:
                line.append(column.text)

        ret.append(','.join(line))

    return ret


def parse_root_page(filename):
    '''Return the (region name, URL) pairs found in the downloaded root page.'''
    filepath = get_download_path(filename)
    ret = []

    with filepath.open() as f:
        contents = f.read()

    html = HTML(html=contents)

    for row in [_ for _ in html.find('table')[0].find('table')][2].find('tr'):
        has_link = len(row.find('a')) > 0
        if not has_link:
            continue

        link = row.find('a')[0].attrs['href']
        regione = row.text.lower().split('\n')[0]  # there are a couple of '\n'

        ret.append((regione, link))

    return ret


def save_csv(url, name):
    '''Download the region page at url and write "<name>.csv" next to the script.'''
    path = download_and_save(url, '%s.html' % name)
    lines = csv_from_html(path)

    with Path('%s.csv' % name).open("w") as f:
        for line in lines:
            f.write(line)
            f.write('\n')


def retrieve_regions():
    download_and_save(URL_ROOT_PAGE, 'root.html')
    return parse_root_page('root.html')


if __name__ == '__main__':
    urls = retrieve_regions()

    for region_name, url in urls:
        save_csv(url, region_name)
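The script only depends on requests and requests_html, and its functions can also be driven from other code. Below is a minimal usage sketch; it assumes the gist has been saved as a module named pec_comuni.py (the filename is not part of the gist) and that the site layout still matches the XPath and row indices used above.

# Usage sketch: `pec_comuni` is an assumed module name for the gist above;
# everything imported from it is defined in the script itself.
from pec_comuni import retrieve_regions, save_csv

# download the root page and collect the (region name, relative URL) pairs
regions = retrieve_regions()
print('found %d regions' % len(regions))

# build the CSV for a single region instead of all of them
for name, url in regions:
    if name == 'lombardia':  # region names come lower-cased from parse_root_page()
        save_csv(url, name)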