Created
March 9, 2011 17:24
-
-
Save cimi/862587 to your computer and use it in GitHub Desktop.
Enriches geonames.org information with local city names drawn from Google
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib | |
import urllib2 | |
import json | |
import pprint | |
import codecs | |
import sqlite3 | |
import sys | |
import logging | |
import time | |
# range of cities to attempt geocoding | |
if len(sys.argv) < 3: | |
print "You must provide the range of ids to geocode" | |
exit() | |
RANGE_START = int(sys.argv[1]) | |
RANGE_END = int(sys.argv[2]) | |
# logging configuration | |
LOG_FILENAME = 'populate-' + str(RANGE_START) + '-' + str(RANGE_END) + '.log' | |
logging.basicConfig(filename=LOG_FILENAME,level=logging.DEBUG) | |
# list of languages supported by the google geocoding service | |
google_languages = [ | |
"ar", "eu", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", | |
"eu", "fa", "fi", "fil", "fr", "gl", "gu", "hi", "hr", "hu", "id", | |
"it", "iw", "ja", "kn", "ko", "lt", "lv", "ml", "mr", "nl", "no", | |
"pl", "pt", "ro", "ru", "sk", "sl", "sr", "sv", "tl", "ta", "te", | |
"th", "tr", "uk", "vi", "zh-CN", "zh-TW" | |
] | |
bing_languages = [ | |
"cs", "da", "nl", "fi", "fr", "de", "it", "ja", "nb", "pt", "es", "sv", "en" | |
] | |
bing_locales = { | |
"cs" : "cs-CZ", | |
"da" : "da-DK", | |
"nl" : "nl-NL", | |
"fi" : "fi-FI", | |
"de" : "de-DE", | |
"it" : "it-IT", | |
"ja" : "ja-JP", | |
"nb" : "nb-NO", | |
"pt" : "pt-PT", | |
"es" : "es-ES", | |
"sv" : "sv-SE", | |
"en" : "en-US" | |
} | |
def init_db(cities_file, connection): | |
""" Reads the geonames file and puts the contents inside a database """ | |
c = connection.cursor() | |
c.execute('''DROP TABLE cities;''') | |
c.execute(''' | |
CREATE TABLE IF NOT EXISTS cities ( | |
id INTEGER PRIMARY KEY AUTOINCREMENT, | |
geoname_id INTEGER, | |
geoname STRING, | |
ascii_name STRING, | |
local_name STRING, | |
en_name STRING, | |
alternate_names STRING, | |
cc STRING, | |
lat FLOAT, | |
lon FLOAT | |
); ''') | |
# column names for the cities file - city characteristics taken from geonames | |
columns = cities_file.readline().strip().split('\t') | |
# array for storing all the cities | |
cities = {} | |
for line in cities_file: | |
# populate city objects with each of the columns | |
tokens = line.strip().split('\t') | |
city = {} | |
for idx,column in enumerate(columns): | |
city[column] = tokens[idx] | |
if column == 'alternate_names' or column == 'name': | |
city[column] = city[column].decode('utf-8') | |
# insert into DB | |
sql = 'INSERT INTO cities (geoname_id, geoname, ascii_name, alternate_names, cc, lat, lon) VALUES (:geoname, :name, :ascii_name, :alternate_names, :cc, :lat, :lon)'; | |
cur = c.execute(sql, city) | |
connection.commit() | |
sql = 'SELECT last_insert_rowid()' | |
res = c.execute(sql) | |
for row in c: | |
city['id'] = row[0] | |
cities[city['id']] = city | |
c.close() | |
return cities | |
def read_db(connection): | |
""" Reads the contents of the database file into memory and returns a dictionary of the form id => city """ | |
cities = {} | |
c = connection.cursor() | |
c.execute('''SELECT * FROM cities;''') | |
columns = ["id", "geoname_id", "geoname", "ascii_name", "local_name", "en_name", "alternate_names", "cc", "lat", "lon"] | |
for row in c: | |
city = {} | |
for idx,x in enumerate(row): | |
city[columns[idx]] = x | |
cities[city['id']] = city | |
logging.debug(str(len(cities.keys())) + " cities loaded from database") | |
return cities | |
def dump_db(cities_file, connection): | |
""" Dumps the contents of the SQL database to disk in a plaintext file (tab separated) """ | |
c = connection.cursor() | |
c.execute('''SELECT * FROM cities;''') | |
columns = ["id", "geoname_id", "geoname", "ascii_name", "local_name", "en_name", "alternate_names", "cc", "lat", "lon"] | |
cities_file.write("\t".join(columns) + "\n") | |
for row in c: | |
cities_file.write("\t".join(unicode(x).encode('utf-8') for x in row) + "\n") | |
def get_languages(countries_file): | |
""" Gets the languages associated with a country from the countries resource file """ | |
countries = {} | |
for line in countries_file: | |
tokens = line.split('\t'); | |
if tokens[1]: | |
languages = tokens[-1].strip().split(',') | |
default_languages = [] | |
for language in languages: | |
default_languages.append(language.split('-')[0].strip("\"")) | |
countries[tokens[1]] = default_languages | |
return countries | |
def get_local_name(city, languages, connection): | |
""" Determine a city's local name from the official language of it's country """ | |
# if the city name has already been queried for, return it directly | |
if city['local_name'] != None and city['local_name'] != '#bing' and city['local_name'] != '#retry': | |
logging.debug("City was already in db: " + city['ascii_name'] + " == " + city['local_name']) | |
return city['local_name'] | |
# select the default language | |
language = None | |
if city['cc'] in languages: | |
language = languages[city['cc']][0] | |
else: | |
language = "language not available" | |
if language in bing_languages: | |
try: | |
url = "http://dev.virtualearth.net/REST/v1/Locations/{1}?o=json&key=AlBXPeA6_Mj8OIvLRJvHvx7EYhf9URHvtnK4Oy30LfvVSUK0Mlm7NNef4IhcqJVi&c={0}".format(bing_locales[language], urllib.quote(city['ascii_name'])) | |
logging.debug("Requesting " + url) | |
# wait one second before performing the actual request so that we don't get rejected by Google since we're not paying | |
time.sleep(1) | |
geocoding = json.loads(urllib2.urlopen(url).read()) | |
if (geocoding['resourceSets']): | |
address = geocoding['resourceSets'][0]['resources'][0]['address'] | |
city['local_name'] = address['locality'] if 'locality' in address.keys() else geocoding['resourceSets'][0]['resources'][0]['name'] | |
logging.debug("Found " + city['local_name']) | |
else: | |
city['local_name'] = '#not_found' | |
logging.warn("City not found " + city['ascii_name']) | |
except urllib2.URLError: | |
# if the service rejected the error, log the occurrence | |
city['local_name'] = '#retry' | |
logging.warn("Request rejected " + url) | |
time.sleep(10) | |
except KeyError: | |
city['local_name'] = '#not_found' | |
logging.warn("City not found " + city['ascii_name']) | |
elif language in google_languages: | |
try: | |
url = "http://maps.googleapis.com/maps/api/geocode/json?language={0}&address={1}&sensor=false".format(language, urllib.quote(city['ascii_name'])) | |
logging.debug("Requesting " + url) | |
# wait one second before performing the actual request so that we don't get rejected by Google since we're not paying | |
time.sleep(1) | |
geocoding = json.loads(urllib2.urlopen(url).read()) | |
if geocoding['results']: | |
city['local_name'] = geocoding['results'][0]['address_components'][0]['long_name'] | |
logging.debug("Found " + city['local_name']) | |
else: | |
city['local_name'] = '#not_found' | |
logging.warn("City not found " + city['ascii_name']) | |
except urllib2.URLError: | |
# if the service rejected the error, log the occurrence | |
city['local_name'] = '#retry' | |
logging.warn("Request rejected " + url) | |
time.sleep(10) | |
else: | |
# if the language is not supported by the geocoding provider, log the occurence | |
city['local_name'] = '#not supported' | |
logging.warn("Language not supported for " + city['ascii_name'] + ", " + language) | |
# if a local name could be determined, insert it in the db | |
if city['local_name']: | |
sql = "UPDATE cities SET local_name = :local_name WHERE id = :id" | |
c = connection.cursor() | |
c.execute(sql, city) | |
connection.commit() | |
conn = sqlite3.connect('cities.db') | |
# cities = init_db(open('cities15000.tsv'), conn) | |
languages = get_languages(open('countries.tsv')) | |
cities = read_db(conn) | |
for id,city in cities.items(): | |
if id in range(RANGE_START, RANGE_END): | |
city['local_name'] = get_local_name(city, languages, conn) | |
dump_db(open('cities_out.tsv', 'w'), conn) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment