Madrid Polen Crawler
""" | |
This is a script for parsing polen levels in Madrid, Spain. | |
It used to work in 2019 but you may need to make fixes to make it work in the following years. | |
""" | |
import re
import locale
import time
import logging

import requests
import pandas as pd
from bs4 import BeautifulSoup
import coloredlogs

coloredlogs.install()
logger = logging.getLogger("Polen Crawler")
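
# The es_ES locale lets the Spanish month abbreviations (e.g. 'ene', 'dic')
# in the server's '%d-%b-%Y' dates be parsed further below.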
locale.setlocale(locale.LC_TIME, "es_ES")


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield tuple(l[i:i + n])


def format_name(name):
    # The server's HTML comes back with mis-encoded UTF-8 (read as Latin-1);
    # map the mojibake sequences back to the intended Spanish characters.
    return name.replace('Ã¡', 'á').replace('Ã\xad', 'í')


def process_chunk(chunk):
    type_ = format_name(chunk[0].strip())
    amount = chunk[1].strip()
    amount = re.sub(r'(?<=\d)[,\.](?=\d)', '', amount)
    level, threshold = chunk[2].strip().split(" (")
    threshold = threshold.strip()[:-1]
    return (type_, amount, level, threshold)
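
# For example (hypothetical values), a chunk like ('Gramíneas ', ' 1.234 ', ' Bajo (50) ')
# would come back as ('Gramíneas', '1234', 'Bajo', '50').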


cities = {
    'Madrid-Arganzuela': 'X=262&Y=299',
    'Ciudad Universitaria': 'X=249&Y=282',
    'Las Rozas': 'X=199&Y=252',
    'Collado Villalba': 'X=171&Y=203',
    'Alcobendas': 'X=282&Y=242',
    'Salamanca': 'X=264&Y=279',
    'Coslada': 'X=304&Y=283',
    'Alcala de Henares': 'X=357&Y=262',
    'Getafe': 'X=252&Y=332',
    'Aranjuez': 'X=290&Y=441'
}
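
# Each value gives the X/Y pixel coordinates of a sampling point on the
# 450x493 px map requested below (WIDTH/HEIGHT in the base URL); the WMS
# GetFeatureInfo call returns the measurements for the feature at that pixel.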
base = 'http://gestiona.madrid.org/geoserver/wms?SERVICE=WMS&VERSION=1.1.1&REQUEST=GetFeatureInfo&LAYERS=SPOL_V_CAPTADORES_GIS&QUERY_LAYERS=SPOL_V_CAPTADORES_GIS&STYLES=&BBOX=365560.97254%2C4415910.465472%2C495339.02746%2C4558089.534528&FEATURE_COUNT=50&HEIGHT=493&WIDTH=450&FORMAT=image%2Fpng&INFO_FORMAT=text%2Fhtml&SRS=EPSG%3A23030&'
columns = ['date', 'point', 'type', 'amount', 'level', 'threshold']

try:
    # Resume from an existing data file if there is one; pandas parses the
    # ISO-formatted 'date' column directly.
    df = pd.read_csv("polen.csv", parse_dates=['date'])
except FileNotFoundError:
    df = pd.DataFrame(columns=columns)

failed = set()
for city, arg in cities.items():
    logger.info(f"Fetching {city}...")
    html_doc = requests.get(base + arg).text
    logger.info(f"Parsing {city}...")
    soup = BeautifulSoup(html_doc, 'html.parser')
    results = [label.get_text() for label in soup.find_all('label')]
    if results:
        point = results[1].strip()
        date = results[3].strip()
        # Measurements start at the 8th label and come in (type, amount, level) triples.
        new_rows = pd.DataFrame([[date, point, *process_chunk(chunk)] for chunk in chunks(results[7:], 3)],
                                columns=columns)
        new_rows.date = pd.to_datetime(new_rows.date, format='%d-%b-%Y')
        if not new_rows.empty:
            df = pd.concat([df, new_rows], sort=True)
            num_rows = new_rows.shape[0]
            logger.info(f"Appended {num_rows} rows to the records")
        else:
            failed.add(city)
    else:
        logger.error(f"No parseable data returned for {city}")
        failed.add(city)
    time.sleep(3)  # be gentle with the server between requests
logger.info("Processing concatenated data...") | |
df = df.drop_duplicates(subset=['date', 'point', 'type']).reset_index(drop=True) | |
df = df.sort_values(by=['date', 'point', 'type']) | |
df = df.reset_index(drop=True) | |
df = df[columns] | |
logger.info("Writing to csv file...") | |
df.to_csv("polen.csv", index=False) | |
logger.info("Finished!") | |

if failed:
    failed_cities = ", ".join(sorted(failed))
    logger.error(f"The following points failed: {failed_cities}")

# JSON: http http://gestiona.madrid.org/geoserver/wms\?SERVICE\=WMS\&VERSION\=1.1.1\&REQUEST\=GetFeatureInfo\&LAYERS\=SPOL_V_CAPTADORES_GIS\&QUERY_LAYERS\=SPOL_V_CAPTADORES_GIS\&STYLES\=\&BBOX\=365560.97254%2C4415910.465472%2C495339.02746%2C4558089.534528\&FEATURE_COUNT\=50\&HEIGHT\=493\&WIDTH\=450\&FORMAT\=image%2Fpng\&INFO_FORMAT\=application%2Fjson\&SRS\=EPSG%3A23030\&X\=238\&Y\=325
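
# The same endpoint can also return JSON (see the httpie command above), which
# would avoid HTML scraping entirely. A minimal, untested sketch: it assumes the
# response follows the usual WMS GetFeatureInfo GeoJSON shape
# ({"features": [{"properties": {...}}, ...]}); the property names inside
# "properties" are not verified here and may differ from the HTML labels.
def fetch_point_json(arg):
    """Fetch one sampling point's feature properties as JSON (sketch, not called above)."""
    json_base = base.replace('INFO_FORMAT=text%2Fhtml', 'INFO_FORMAT=application%2Fjson')
    response = requests.get(json_base + arg)
    return [feature.get('properties', {}) for feature in response.json().get('features', [])]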