Last active
July 20, 2016 21:37
-
-
Save migurski/4a5821699e1ed519a604 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8 | |
''' Guess at ISO-3166-2 admin 1 codes for Natural Earth data. | |
Uses subdivision names from Ola Holmström’s iso-3166-2: | |
https://github.com/olahol/iso-3166-2.js/blob/master/data.csv | |
ne_10m_admin_1_states_provinces.csv is a direct CSV | |
export of Natural Earth admin-1 data as of Nov 8, 2012: | |
https://github.com/nvkelso/natural-earth-vector/tree/c97860331/10m_cultural | |
Generates out.csv, which can be joined back to Natural Earth on adm1_code. | |
''' | |
from unicodecsv import reader, DictReader, writer | |
from unidecode import unidecode | |
from operator import itemgetter | |
from itertools import groupby | |
from fuzzywuzzy import fuzz | |
from re import split | |
def unidecode_lower(string): | |
return unidecode(string).lower() | |
with open('olahol-iso-3166-2.csv') as file: | |
iso_3166_2_rows = list() | |
for row in reader(file): | |
# Afghanistan,AF-BDS,Badakhshān,Province,Badaẖšan,AF-BDS*,12267,2,AF,AFG | |
country, iso_3166_2, name, _, name_variants, _, _, _, iso_a2, _ = row | |
names = split(r'(?: \[|\])', name) + name_variants.split(', ') | |
names = set(map(unidecode_lower, filter(None, names))) | |
iso_3166_2_rows.append(dict(country=country, iso_3166_2=iso_3166_2, | |
name=name, name_variants=name_variants, | |
iso_a2=iso_a2, names=names)) | |
with open('ne_10m_admin_1_states_provinces.csv') as file: | |
ne_admin1_rows = list() | |
iso_3166_1s = set() | |
for row in DictReader(file): | |
# adm1_code,iso_a2,iso_3166_2,name,name_alt,name_local | |
names = [row['name'], row['name_local']] + row['name_alt'].split('|') | |
row['names'] = set(map(unidecode_lower, filter(None, names))) | |
ne_admin1_rows.append(row) | |
iso_3166_1s.add(row['iso_a2']) | |
for iso_a2 in iso_3166_1s: | |
#if iso_a2 not in ('CM', 'RU', 'PL'): continue | |
# Pick out the Natural Earth entries for this country only missing an ISO-3166-2 code. | |
_ne_admin1_rows = [R for R in ne_admin1_rows if R['iso_a2'] == iso_a2 and R['iso_3166_2'].endswith('-')] | |
_iso_3166_2_rows = [R for R in iso_3166_2_rows if R['iso_a2'] == iso_a2] | |
print iso_a2 | |
# | |
# Start with a high minimum cutoff, and work downwards | |
# so that exact matches are found as early as possible. | |
# | |
for minimum_ratio in range(100, 79, -5): | |
for ne_admin1_row in _ne_admin1_rows: | |
best_ratio, best_iso_row = minimum_ratio, None | |
for iso_3166_2_row in _iso_3166_2_rows: | |
for ne_name in ne_admin1_row['names']: | |
for iso_name in iso_3166_2_row['names']: | |
ratio = fuzz.ratio(ne_name, iso_name) | |
if ratio > best_ratio: | |
best_ratio, best_iso_row = ratio, iso_3166_2_row | |
if best_iso_row: | |
print best_ratio, ne_admin1_row['names'], best_iso_row['names'], best_iso_row['iso_3166_2'] | |
ne_admin1_row['iso_3166_2'] = best_iso_row['iso_3166_2'] | |
_iso_3166_2_rows.remove(best_iso_row) | |
_ne_admin1_rows.remove(ne_admin1_row) | |
with open('out.csv', 'w') as file: | |
cols = 'adm1_code,iso_a2,iso_3166_2,name,name_alt,name_local'.split(',') | |
out = writer(file) | |
out.writerow(cols) | |
for row in ne_admin1_rows: | |
out.writerow([row[col] for col in cols]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment