Madrid Polen Crawler
""" | |
This is a script for parsing polen levels in Madrid, Spain. | |
It used to work in 2019 but you may need to make fixes to make it work in the following years. | |
""" | |
import re
import locale
import time
import logging

import requests
import pandas as pd
from bs4 import BeautifulSoup
import coloredlogs

coloredlogs.install()
logger = logging.getLogger("Polen Crawler")
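
# The es_ES locale lets the Spanish month abbreviations (e.g. 'ene', 'dic')
# in the server's '%d-%b-%Y' dates be parsed further below.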
locale.setlocale(locale.LC_TIME, "es_ES")


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield tuple(l[i:i + n])


def format_name(name):
    # The server's HTML comes back with mis-encoded UTF-8 (read as Latin-1);
    # map the mojibake sequences back to the intended Spanish characters.
    return name.replace('Ã¡', 'á').replace('Ã\xad', 'í')


def process_chunk(chunk):
    type_ = format_name(chunk[0].strip())
    amount = chunk[1].strip()
    amount = re.sub(r'(?<=\d)[,\.](?=\d)', '', amount)
    level, threshold = chunk[2].strip().split(" (")
    threshold = threshold.strip()[:-1]
    return (type_, amount, level, threshold)
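
# For example (hypothetical values), a chunk like ('Gramíneas ', ' 1.234 ', ' Bajo (50) ')
# would come back as ('Gramíneas', '1234', 'Bajo', '50').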


cities = {
    'Madrid-Arganzuela': 'X=262&Y=299',
    'Ciudad Universitaria': 'X=249&Y=282',
    'Las Rozas': 'X=199&Y=252',
    'Collado Villalba': 'X=171&Y=203',
    'Alcobendas': 'X=282&Y=242',
    'Salamanca': 'X=264&Y=279',
    'Coslada': 'X=304&Y=283',
    'Alcala de Henares': 'X=357&Y=262',
    'Getafe': 'X=252&Y=332',
    'Aranjuez': 'X=290&Y=441'
}
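
# Each value gives the X/Y pixel coordinates of a sampling point on the
# 450x493 px map requested below (WIDTH/HEIGHT in the base URL); the WMS
# GetFeatureInfo call returns the measurements for the feature at that pixel.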
base = 'http://gestiona.madrid.org/geoserver/wms?SERVICE=WMS&VERSION=1.1.1&REQUEST=GetFeatureInfo&LAYERS=SPOL_V_CAPTADORES_GIS&QUERY_LAYERS=SPOL_V_CAPTADORES_GIS&STYLES=&BBOX=365560.97254%2C4415910.465472%2C495339.02746%2C4558089.534528&FEATURE_COUNT=50&HEIGHT=493&WIDTH=450&FORMAT=image%2Fpng&INFO_FORMAT=text%2Fhtml&SRS=EPSG%3A23030&'
columns = ['date', 'point', 'type', 'amount', 'level', 'threshold']

try:
    # Resume from an existing data file if there is one; pandas parses the
    # ISO-formatted 'date' column directly.
    df = pd.read_csv("polen.csv", parse_dates=['date'])
except FileNotFoundError:
    df = pd.DataFrame(columns=columns)

failed = set()
for city, arg in cities.items():
    logger.info(f"Fetching {city}...")
    html_doc = requests.get(base + arg).text
    logger.info(f"Parsing {city}...")
    soup = BeautifulSoup(html_doc, 'html.parser')
    results = [label.get_text() for label in soup.find_all('label')]
    if results:
        point = results[1].strip()
        date = results[3].strip()
        # Measurements start at the 8th label and come in (type, amount, level) triples.
        new_rows = pd.DataFrame([[date, point, *process_chunk(chunk)] for chunk in chunks(results[7:], 3)],
                                columns=columns)
        new_rows.date = pd.to_datetime(new_rows.date, format='%d-%b-%Y')
        if not new_rows.empty:
            df = pd.concat([df, new_rows], sort=True)
            num_rows = new_rows.shape[0]
            logger.info(f"Appended {num_rows} rows to the records")
        else:
            failed.add(city)
    else:
        logger.error(f"No parseable data returned for {city}")
        failed.add(city)
    time.sleep(3)  # be gentle with the server between requests
logger.info("Processing concatenated data...") | |
df = df.drop_duplicates(subset=['date', 'point', 'type']).reset_index(drop=True) | |
df = df.sort_values(by=['date', 'point', 'type']) | |
df = df.reset_index(drop=True) | |
df = df[columns] | |
logger.info("Writing to csv file...") | |
df.to_csv("polen.csv", index=False) | |
logger.info("Finished!") | |

if failed:
    failed_cities = ", ".join(sorted(failed))
    logger.error(f"The following points failed: {failed_cities}")

# JSON: http http://gestiona.madrid.org/geoserver/wms\?SERVICE\=WMS\&VERSION\=1.1.1\&REQUEST\=GetFeatureInfo\&LAYERS\=SPOL_V_CAPTADORES_GIS\&QUERY_LAYERS\=SPOL_V_CAPTADORES_GIS\&STYLES\=\&BBOX\=365560.97254%2C4415910.465472%2C495339.02746%2C4558089.534528\&FEATURE_COUNT\=50\&HEIGHT\=493\&WIDTH\=450\&FORMAT\=image%2Fpng\&INFO_FORMAT\=application%2Fjson\&SRS\=EPSG%3A23030\&X\=238\&Y\=325
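
# The same endpoint can also return JSON (see the httpie command above), which
# would avoid HTML scraping entirely. A minimal, untested sketch: it assumes the
# response follows the usual WMS GetFeatureInfo GeoJSON shape
# ({"features": [{"properties": {...}}, ...]}); the property names inside
# "properties" are not verified here and may differ from the HTML labels.
def fetch_point_json(arg):
    """Fetch one sampling point's feature properties as JSON (sketch, not called above)."""
    json_base = base.replace('INFO_FORMAT=text%2Fhtml', 'INFO_FORMAT=application%2Fjson')
    response = requests.get(json_base + arg)
    return [feature.get('properties', {}) for feature in response.json().get('features', [])]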