Created
June 22, 2017 19:18
-
-
Save eskerda/4a1623b9aa7834e10a3127e618162770 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import json | |
import logging | |
import argparse | |
from urlparse import urljoin | |
from difflib import get_close_matches | |
import requests | |
import geocoder | |
from lxml import html | |
from slugify import slugify | |
import pybikes | |
from pybikes.bicincitta import Bicincitta | |
from citybikes.utils import dist_sort | |
# HTTP proxy for all portal requests (127.0.0.1:8118 — presumably a local
# privoxy/tor endpoint; confirm before running elsewhere).
proxy = {
    'http': 'http://127.0.0.1:8118'
}

# Command line: -o <file> selects where the resulting JSON is written,
# defaulting to stdout.
parser = argparse.ArgumentParser()
parser.add_argument("-o", metavar="output", dest="output",
                    type=argparse.FileType('w'),
                    default=sys.stdout, help="Output file")
args = parser.parse_args()

# Console logger for matcher progress, DEBUG level and up.
logger = logging.getLogger('bicincitta_matcher')
logger.setLevel(logging.DEBUG)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
console_handler.setFormatter(logging.Formatter('%(levelname)s - %(message)s'))
logger.addHandler(console_handler)
class DoesNotWorkException(Exception):
    """Raised when a bicincitta feed cannot be fetched or parsed."""
def get_feeds():
    """Yield (city name, absolute feed URL) pairs scraped from the
    bicincitta portal's city search page."""
    feeds_url = "http://www.bicincitta.com/css/Portal_11/Pages/frmCercaComune.aspx"
    response = requests.get(feeds_url, proxies=proxy)
    dom = html.fromstring(response.text)
    anchors = dom.xpath("""
        //div[@class="comuniNew"]//a
    """)
    for anchor in anchors:
        # hrefs on the page may be relative; resolve against the page URL
        yield (anchor.text, urljoin(feeds_url, anchor.get('href')))
def match_pybikes_to_feed(meta, cities):
    """Fuzzy-match a pybikes instance against the scraped feed cities.

    meta:   pybikes meta dict; only meta['city'] is used.
    cities: mapping of city name -> feed url.

    Returns a (city name, feed url) tuple for the closest city name with
    similarity >= 0.7, or None when nothing is close enough.
    """
    candidates = get_close_matches(meta['city'], cities.keys(), 1, 0.7)
    if candidates:
        best = candidates[0]
        return (best, cities[best])
    return None
def match_feed_to_pybikes(name, url, instances):
    """Match a bicincitta feed to the geographically nearest pybikes
    instance, using the feed's first station position as the anchor.

    name:      feed city name (unused here, kept for caller symmetry).
    url:       feed url, passed straight to Bicincitta.
    instances: candidate pybikes instance dicts with
               meta['latitude'] / meta['longitude'].

    Returns the nearest instance dict when it is within the 0.1
    distance threshold (units as defined by dist_sort), otherwise None.
    Raises DoesNotWorkException when the feed cannot be parsed.
    """
    instance = Bicincitta("", {}, url)
    try:
        instance.update()
    except IndexError:
        # Bicincitta raises IndexError on feeds it cannot scrape
        raise DoesNotWorkException("Does not work")
    lat, lng = (instance.stations[0].latitude,
                instance.stations[0].longitude)
    # Guard: with no remaining candidates there is nothing to match.
    # (The original unconditionally indexed sorted_instances[0], which
    # raised an uncaught IndexError once all instances were matched.)
    if not instances:
        return None
    # Get nearest network to station
    sorted_instances = dist_sort([lat, lng], instances,
        lambda i: (i['meta']['latitude'], i['meta']['longitude']))
    match, distance = sorted_instances[0]
    if distance < 0.1:
        return match
    return None
def generate_instance(name, url, tag, meta):
    """Build a pybikes instance record for the output JSON.

    Note: ``name`` is accepted by the signature but intentionally not
    stored in the record; callers pass it anyway.
    """
    # Keyword order mirrors the original literal so JSON key order is stable
    return dict(meta=meta, tag=tag, url=url)
if __name__ == "__main__":
    instances = [i for cls, i in pybikes.get_instances('bicincitta')]
    cities = {name: url for name, url in get_feeds()}

    # Clone so we can delete already added stuff
    # This makes it impossible to match with something already matched
    _cities = dict(cities)
    _instances = list(instances)

    matches = []        # instance records matched to a feed
    dont_work = []      # (name, url) feeds that failed to parse
    new_cities = []     # instance records for feeds with no pybikes match

    def log_current_stats():
        # Snapshot of matching progress, logged between phases.
        logger.info("Current stats")
        logger.info("--------------")
        logger.info("Matched instances: %d", len(matches))
        logger.info("Cities to match: %d", len(_cities))
        logger.info("Instances to match: %d", len(_instances))
        logger.info("Non working cities: %d", len(dont_work))

    def remove_instance_by_tag(tag):
        # Drop the instance with this tag from the unmatched pool.
        idx = next(
            (idx for idx, i in enumerate(_instances) if i['tag'] == tag),
            None
        )
        # Guard the miss: the original did `del _instances[idx]` with a
        # possibly-None idx, which raises TypeError.
        if idx is not None:
            del _instances[idx]

    log_current_stats()
    logger.info("")
    logger.info("Matching instances to bicincitta feeds")
    # Phase 1: match existing instances by fuzzy city-name comparison
    for meta in instances:
        match = match_pybikes_to_feed(meta['meta'], _cities)
        if not match:
            logger.error("Found no match for %s", meta)
            continue
        # Remove from cities so it cannot be matched twice
        _cities.pop(match[0])
        remove_instance_by_tag(meta['tag'])
        # Generate new instance for this, keeping old meta
        matches.append(generate_instance(match[0], match[1], meta['tag'],
                                         meta['meta']))
        logger.debug("Found match %s and %s", match[0], meta)

    log_current_stats()
    logger.info("Matching bicincitta feeds to non-matched instances")
    # Phase 2: fine grain match remaining cities by geographic distance.
    # Iterate over a copy: _cities is mutated inside the loop (items()
    # happens to return a list on Python 2, but this is explicit and
    # also safe on Python 3).
    for name, url in list(_cities.items()):
        try:
            match = match_feed_to_pybikes(name, url, _instances)
        except DoesNotWorkException:
            dont_work.append((name, url))
            _cities.pop(name)
            logger.error("%s , %s", name, url)
            continue
        if not match:
            continue
        _cities.pop(name)
        remove_instance_by_tag(match['tag'])
        logger.debug("Found match %s with %s", name, match)
        matches.append(generate_instance(name, url, match['tag'],
                                         match['meta']))

    log_current_stats()

    # At this point we have tried our best to match all our instances.
    # Let's generate an instance for all new feeds!
    logger.info("Generating new instances")
    for name, url in list(_cities.items()):
        # First test that the feed works at all before emitting it.
        try:
            instance = Bicincitta(name, {}, url)
            instance.update()
        except Exception:
            # Was a bare `except:` — that also swallowed KeyboardInterrupt
            logger.error("%s: %s", name, url)
            _cities.pop(name)
            dont_work.append((name, url))
            continue
        # Meta is left empty on purpose, to use filler script
        tag = 'bicincitta-%s' % slugify(name)
        new_cities.append(generate_instance(name, url, tag, {}))
        logger.debug("Generating instance for %s %s", name, url)
        _cities.pop(name)

    out_data = {
        'system': 'bicincitta',
        'class': 'Bicincitta',
        'instances': matches + new_cities
    }
    # Write resulting instance list to the selected output
    args.output.write(json.dumps(out_data, sort_keys=False, indent=4,
                                 separators=(',', ':')))

    logger.info("--------------")
    logger.info("Final stats")
    logger.info("--------------")
    logger.info("Matched instances: %d", len(matches))
    logger.info("New feeds: %d", len(new_cities))
    logger.info("Cities to match: %d", len(_cities))
    logger.info("Instances to match: %d", len(_instances))
    logger.info("Non working cities: %d", len(dont_work))
    logger.info("--------------")
    logger.error("Non working urls:")
    for name, url in dont_work:
        logger.error("> %s: %s", name, url)
    logger.error("Unmatched instances")
    for i in _instances:
        logger.error(i)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment