Last active
February 20, 2018 04:58
-
-
Save jayvdb/d1d870cf0e5f09837fd78cf82cce276b to your computer and use it in GitHub Desktop.
Indonesian Regency code comparison
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Indonesia regency code comparison.
Provinces differ for two codes (in the 90s), covering Papua and Papua Barat.
More information on provinces at:
https://gitlab.com/ciptamedia/ciptamedia-dev.gitlab.io/issues/104#note_59625585
This script compares the regencies to determine how much they overlap.
""" | |
import csv | |
import os | |
def load_csv(filename, fieldnames=None):
    """Read a CSV file and return its rows as a list of dicts.

    Args:
        filename: Path of the CSV file to read.
        fieldnames: Optional column names.  When omitted,
            ``csv.DictReader`` takes them from the first row of the file;
            when given, the first row is treated as data.

    Returns:
        A list with one mapping per data row, keyed by column name.
    """
    # newline='' is what the csv module requires so that line endings inside
    # quoted fields reach the parser untouched.  The explicit encoding keeps
    # decoding independent of the platform locale (the input files are
    # assumed to be UTF-8).
    with open(filename, newline='', encoding='utf-8') as f:
        return list(csv.DictReader(f, fieldnames=fieldnames))
def get_wdq_regencies():
    """Load the Wikidata regency rows from ``../wdq-regencies.csv``.

    The file is an export of http://tinyurl.com/y9nacntd; no column names
    are passed, so they are taken from the file's first row.
    """
    path = os.path.join('..', 'wdq-regencies.csv')
    return list(load_csv(path))
def get_bps_regencies():
    """Load the BPS regency rows from ``../bps-regencies.csv``.

    The file comes from
    https://github.com/edwardsamuel/Wilayah-Administratif-Indonesia/blob/master/csv/regencies.csv
    and is read with the explicit column names ``code``, ``province_code``
    and ``name``.
    """
    columns = ['code', 'province_code', 'name']
    path = os.path.join('..', 'bps-regencies.csv')
    return list(load_csv(path, fieldnames=columns))
def check_regency_lists():
    """Compare BPS regency names with the Wikidata labels sharing their code.

    Prints one line per BPS row (code missing from Wikidata, names match,
    or names differ), followed by the totals of matching and non-matching
    names.  Names are compared lower-cased with hyphens and spaces removed;
    the word 'kabupaten' is removed from the BPS name only.
    """
    def squash(text):
        # Remove hyphens and spaces before comparing.
        return text.replace('-', '').replace(' ', '')

    wikidata_rows = get_wdq_regencies()
    bps_rows = get_bps_regencies()

    # Index the Wikidata rows by code; a later duplicate replaces an earlier.
    by_code = {entry['rawcode']: entry for entry in wikidata_rows}

    matched = 0
    mismatched = 0
    for bps in bps_rows:
        code = bps['code']
        if code not in by_code:
            print('Code %s missing from Wikidata' % code)
            continue

        bps_key = squash(bps['name'].lower().replace('kabupaten', '').strip())
        label = by_code[code]['itemLabel']
        wd_key = squash(label.lower())

        if bps_key != wd_key:
            mismatched += 1
            print('%s != %s' % (label, bps['name']))
        else:
            matched += 1
            print('%s ok' % label)

    print('Matches: %d' % matched)
    print('Not ok: %d' % mismatched)
def main(argv=None):
    """Script entry point: run the regency list comparison.

    ``argv`` is accepted but not used.
    """
    check_regency_lists()
# Run the comparison only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment