@eskerda
Created June 22, 2017 19:18
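"""Match pybikes 'bicincitta' instances against the feeds listed on
bicincitta.com and write out an updated JSON data file: match by city
name first, then by station location, and finally generate entries for
any feeds that are still unmatched.

Usage (assuming this script is saved as bicincitta_matcher.py and the
imported packages are installed):

    python bicincitta_matcher.py -o bicincitta.json
"""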
import sys
import json
import logging
import argparse
from urllib.parse import urljoin  # Python 2: from urlparse import urljoin
from difflib import get_close_matches

import requests
import geocoder
from lxml import html
from slugify import slugify

import pybikes
from pybikes.bicincitta import Bicincitta
from citybikes.utils import dist_sort

proxy = {
    'http': 'http://127.0.0.1:8118',
}
parser = argparse.ArgumentParser()
parser.add_argument("-o", metavar="output", dest="output",
                    type=argparse.FileType('w'),
                    default=sys.stdout, help="Output file")
args = parser.parse_args()
# create logger
logger = logging.getLogger('bicincitta_matcher')
logger.setLevel(logging.DEBUG)
# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# create formatter
formatter = logging.Formatter('%(levelname)s - %(message)s')
# add formatter to ch
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)
class DoesNotWorkException(Exception):
    pass
def get_feeds():
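    """Scrape the bicincitta.com city search page and yield
    (city name, feed url) tuples."""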
    feeds_url = "http://www.bicincitta.com/css/Portal_11/Pages/frmCercaComune.aspx"
    dom = html.fromstring(requests.get(feeds_url, proxies=proxy).text)
    cities = dom.xpath("""
        //div[@class="comuniNew"]//a
    """)
    for city in cities:
        yield (city.text, urljoin(feeds_url, city.get('href')))
def match_pybikes_to_feed(meta, cities):
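    """Fuzzy-match an existing instance's city name against the scraped
    city names (difflib, 0.7 cutoff). Returns (name, url) or None."""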
    match = get_close_matches(meta['city'], cities.keys(), 1, 0.7)
    if not match:
        return None
    return (match[0], cities[match[0]])
def match_feed_to_pybikes(name, url, instances):
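    """Match a feed to the geographically nearest unmatched pybikes
    instance, using the feed's first station as a reference point.
    Returns the instance dict, or None if nothing is close enough.
    Raises DoesNotWorkException when the feed cannot be parsed."""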
    instance = Bicincitta("", {}, url)
    try:
        instance.update()
    except IndexError:
        raise DoesNotWorkException("Does not work")
    lat, lng = (instance.stations[0].latitude,
                instance.stations[0].longitude)
    # Get nearest instance to the station; dist_sort yields
    # (instance, distance) pairs ordered by distance
    sorted_instances = dist_sort([lat, lng], instances,
                                 lambda i: (i['meta']['latitude'],
                                            i['meta']['longitude']))
    match, distance = sorted_instances[0]
    if distance < 0.1:
        return match
    return None
def generate_instance(name, url, tag, meta):
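    """Build an instance entry for the output file (the name argument
    is currently unused)."""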
    return {
        'meta': meta,
        'tag': tag,
        'url': url,
    }
if __name__ == "__main__":
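    # Three passes: match known instances to feeds by city name, match
    # remaining feeds to instances by station location, then generate
    # fresh entries for whatever feeds are left.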
    instances = [i for cls, i in pybikes.get_instances('bicincitta')]
    cities = {name: url for name, url in get_feeds()}

    # Clone so we can delete already added stuff. This makes it
    # impossible to match with something already matched
    _cities = dict(cities)
    _instances = list(instances)

    matches = []
    dont_work = []
    new_cities = []

    logger.info("Current stats")
    logger.info("--------------")
    logger.info("Matched instances: %d", len(matches))
    logger.info("Cities to match: %d", len(_cities))
    logger.info("Instances to match: %d", len(_instances))
    logger.info("Non working cities: %d", len(dont_work))
    logger.info("")

    logger.info("Matching instances to bicincitta feeds")
    # Match existing instances
    for meta in instances:
        match = match_pybikes_to_feed(meta['meta'], _cities)
        if not match:
            logger.error("Found no match for %s", meta)
            continue
        # Remove from cities
        _cities.pop(match[0])
        idx = next(
            (idx for idx, i in enumerate(_instances)
             if i['tag'] == meta['tag']), None
        )
        del _instances[idx]
        # Generate new instance for this, keeping old meta
        matches.append(generate_instance(match[0], match[1], meta['tag'],
                                         meta['meta']))
        logger.debug("Found match %s and %s", match[0], meta)
logger.info("Current stats")
logger.info("--------------")
logger.info("Matched instances: %d" % len(matches))
logger.info("Cities to match: %d" % len(_cities))
logger.info("Instances to match: %d" % len(_instances))
logger.info("Non working cities: %d" % len(dont_work))
logger.info("Matching bicincitta feeds to non-matched instances")
# Try to fine grain match cities with remaining unmatched instances
for name, url in _cities.items():
try:
match = match_feed_to_pybikes(name, url, _instances)
except DoesNotWorkException:
dont_work.append((name, url))
_cities.pop(name)
logger.error("%s , %s", name, url)
continue
if not match:
continue
_cities.pop(name)
idx = next(
(idx for idx, i in enumerate(_instances)
if i['tag'] == match['tag']), None
)
del _instances[idx]
logger.debug("Found match %s with %s", name, match)
matches.append(generate_instance(name, url, match['tag'],
match['meta']))
logger.info("Current stats")
logger.info("--------------")
logger.info("Matched instances: %d" % len(matches))
logger.info("Cities to match: %d" % len(_cities))
logger.info("Instances to match: %d" % len(_instances))
logger.info("Non working cities: %d" % len(dont_work))
    # At this point we have tried our best to match all our instances.
    # Let's generate an instance for all new feeds!
    logger.info("Generating new instances")
    for name, url in list(_cities.items()):
        # First test that it works
        try:
            instance = Bicincitta(name, {}, url)
            instance.update()
        except Exception:
            logger.error("%s: %s", name, url)
            _cities.pop(name)
            dont_work.append((name, url))
            continue
        # Grab a sample station for geocoding (currently unused: meta is
        # left empty on purpose, to be filled by a filler script)
        sample = instance.stations.pop()
        tag = 'bicincitta-%s' % slugify(name)
        new_cities.append(generate_instance(name, url, tag, {}))
        logger.debug("Generating instance for %s %s", name, url)
        _cities.pop(name)
    out_data = {
        'system': 'bicincitta',
        'class': 'Bicincitta',
        'instances': matches + new_cities,
    }
    # Write out the matches
    args.output.write(json.dumps(out_data, sort_keys=False, indent=4,
                                 separators=(',', ':')))

    logger.info("--------------")
    logger.info("Final stats")
    logger.info("--------------")
    logger.info("Matched instances: %d", len(matches))
    logger.info("New feeds: %d", len(new_cities))
    logger.info("Cities to match: %d", len(_cities))
    logger.info("Instances to match: %d", len(_instances))
    logger.info("Non working cities: %d", len(dont_work))
    logger.info("--------------")

    logger.error("Non working urls:")
    for name, url in dont_work:
        logger.error("> %s: %s", name, url)
    logger.error("Unmatched instances")
    for i in _instances:
        logger.error(i)