#!/usr/local/bin/python3
# -*- coding: utf-8 -*-
from pywikibot.data.sparql import SparqlQuery
import re
import overpass
import time
import os
import errno
# import csv
# import StringIO
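
# Third-party dependencies: the 'pywikibot' and 'overpass' packages (both available on PyPI);
# the exact versions this script was written against are not pinned here.

# Wikidata item IDs look like 'Q' followed by up to nine digits with no leading zero (e.g. 'Q42').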
qnumber = re.compile('^Q[1-9][0-9]{0,8}$')

def ensure_path(path):
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

def get_data(file):
    with open(file, 'r') as data_file:
        lines = data_file.read().split('\n')
    headers = lines[0].split('\t')
    return headers, [line.split('\t') for line in lines[1:] if len(line) > 0]

def load_dict(file):
    headers, data = get_data(file)
    return headers, {row[0]: row[1:] for row in data}

def save_data(file, headers, data):
    with open(file, 'w') as data_file:
        text = '\t'.join(headers) + '\n' + '\n'.join(['\t'.join(v) for v in data]) + '\n'
        data_file.write(text)

def save_dataset(file, data, columns):
    with open(file, 'w') as data_file:
        data.sort(key=lambda row: row[columns - 1].lower())
        text = '[' + '],\n['.join([','.join([('null' if v == '' else '"' + v.replace('"', '\\"') + '"') for v in row[:columns]]) for row in data]) + ']\n'
        data_file.write(text)

def save_dict(file, headers, data):
    save_data(file, headers, [[k] + v for k, v in sorted(data.items())])

def batches(data, size):
    data = list(data)
    for i in range(0, len(data), size):
        yield data[i:i + size]
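
# For example, list(batches([1, 2, 3, 4, 5], 2)) yields [[1, 2], [3, 4], [5]].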

def is_valid_wd(key):
    return qnumber.match(key)

def fill_from_wikidata(filename, query, func, refreshMissing=False):
    sparql = SparqlQuery()
    headers, data = load_dict(filename)
    for k in data.keys():
        if not is_valid_wd(k):
            print('Bad Wikidata ID: {}'.format(k))
    missing = [k for k, v in data.items() if (not v or (refreshMissing and v == ['-'])) and qnumber.match(k)]
    if len(missing) > 0:
        print('Processing {} items'.format(len(missing)))
        count = 0
        for batch in batches(missing, 240):
            ids = ' '.join(['wd:' + s for s in batch])
            qry = query.replace('%IDS%', ids)
            batchSet = set(batch)
            for row in sparql.select(qry, True):
                k, v = func(row)
                data[k] = v
                count += 1
                batchSet.discard(k)
            for k in batchSet:
                data[k] = ['-']
            save_dict(filename, headers, data)
        print('Processed {} items'.format(count))
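
# IDs that return no rows in a batch are stored as ['-'] so that later runs skip them,
# unless refreshMissing=True forces them to be re-queried.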

rmQuotes = re.compile(r'\t"([^\n\t]*)"(\t|\n)')
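# rmQuotes strips the double quotes that the Overpass CSV output apparently wraps around some field values;
# it is used in download_osm below.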

def download_osm(api, filename, ot_query, result_prefix, sleep=0):
    print('downloading OSM data for \'{}\' into {}'.format(ot_query, filename))
    response = api.Get(ot_query, responseformat='csv(::id,wikidata,wikipedia)', verbosity='meta')
    # csvreader = csv.reader(StringIO.StringIO(response), delimiter='\t', quotechar='"')
    # for row in csvreader:
    response = rmQuotes.sub(r'\t\1\2', response)
    count = max(response.count('\n') - 1, 0)
    response = response.strip('\n').replace('\n', '\n' + result_prefix)
    response = response.replace('@id\twikidata\twikipedia', '', 1).strip()
    if len(response) > 0:
        response += '\n'
        with open(filename, 'a') as data_file:
            data_file.write(response)
    print('received {} values'.format(count))
    time.sleep(sleep)
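
# The sleep argument spaces out consecutive Overpass requests, presumably to stay within the API's rate limits.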

def get_osm_tags(filename):
    api = overpass.API(timeout=300)
    with open(filename, 'w') as data_file:
        data_file.write('@id\twikidata\twikipedia\n')
    download_osm(api, filename, 'rel["wikidata"]', 'r')
    download_osm(api, filename, 'way["wikidata"~"^Q1"]', 'w', 230)
    download_osm(api, filename, 'way["wikidata"~"^Q2"]', 'w', 230)
    download_osm(api, filename, 'way["wikidata"~"^Q3"]', 'w', 230)
    download_osm(api, filename, 'way["wikidata"~"^Q[456789]"]', 'w', 0)
    download_osm(api, filename, 'node["wikidata"](-89.9999999,-180,0,0)', 'n', 230)
    download_osm(api, filename, 'node["wikidata"](-89.9999999,0,0,180)', 'n', 230)
    download_osm(api, filename, 'node["wikidata"](0,-180,89.9999999,0)', 'n', 230)
    download_osm(api, filename, 'node["wikidata"](0,0,89.9999999,4.9999999)', 'n', 230)
    download_osm(api, filename, 'node["wikidata"](0,5,89.9999999,9.9999999)', 'n', 300)
    download_osm(api, filename, 'node["wikidata"](0,10,89.9999999,14.9999999)', 'n', 300)
    download_osm(api, filename, 'node["wikidata"](0,15,89.9999999,18.9999999)', 'n', 300)
    download_osm(api, filename, 'node["wikidata"](0,19,89.9999999,29.9999999)', 'n', 300)
    download_osm(api, filename, 'node["wikidata"](0,30.000001,89.9999999,60)', 'n', 300)
    download_osm(api, filename, 'node["wikidata"](0,60.000001,89.9999999,90)', 'n', 300)
    download_osm(api, filename, 'node["wikidata"](0,90.000001,89.9999999,180)', 'n', 0)
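
# Way queries are split by the leading digit of the Q-id and node queries by bounding box,
# presumably to keep each Overpass request small enough to finish within the 300-second timeout.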

def column_to_dict(data_file, dictionary_file, index, validator=None):
    data = get_data(data_file)[1]
    headers, dct = load_dict(dictionary_file)
    for row in data:
        if len(row) <= index:
            continue
        val = row[index]
        if val not in dct and (not validator or validator(val)):
            dct[val] = []
    save_dict(dictionary_file, headers, dct)
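
# column_to_dict only seeds the dictionary file with new keys (empty values);
# the values are filled in later by fill_from_wikidata.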

def populate_p31(filename):
    print('Populating wikidata items with their instance-of (P31) for {}'.format(filename))
    query = '''
        SELECT ?id ?instanceOf
        WHERE {
          VALUES ?id { %IDS% }
          { ?id wdt:P31 ?instanceOf }
          UNION
          { ?id owl:sameAs/wdt:P31 ?instanceOf }
        }'''
    fill_from_wikidata(filename, query, lambda row: (row['id'].getID(), [row['instanceOf'].getID()]))
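
# The owl:sameAs branch also returns P31 for items that have been merged or redirected on Wikidata
# (an assumption about how the query service models redirects).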

def populate_p31info(filename):
    print('Populating instance-of data with their name, description, and class for {}'.format(filename))
    # BIND (EXISTS {?id wdt:P279 wd:Q4167410} as ?isDisambig)
    query = '''
        SELECT ?id ?idLabel ?idDescription ?parentClass ?parentClassLabel
        WHERE {
          VALUES ?id { %IDS% }
          OPTIONAL { ?id wdt:P279 ?parentClass . }
          SERVICE wikibase:label { bd:serviceParam wikibase:language "en,fr,es,de,it,ru,uk,bg,ro,be,bn,ca,cs,da,el,et,fa,fi,he,hi,hu,hy,id,ja,jv,ko,nb,nl,eo,pa,pl,pt,sh,sk,sr,sv,sw,te,th,tr,yue,vec,vi,zh,ar" . }
        }'''
    fill_from_wikidata(filename, query, lambda row: (row['id'].getID(), [
        '',  # IsBad?
        row['idLabel'].value if row['idLabel'] else '',
        row['idDescription'].value if row['idDescription'] else '',
        row['parentClass'].getID() if row['parentClass'] else '',
        row['parentClassLabel'].value if row['parentClassLabel'] else ''
    ]))
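
# The empty first value ('IsBad?') is left blank here and filled in manually in data/p31_info.tsv
# to flag unwanted classes (see the step notes below).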

def extract_bad_data(osm_data, osm_to_p31, p31_info, purge_bad=False):
    if purge_bad:
        print('Combining data and removing bad data from ' + osm_to_p31)
    else:
        print('Combining data and extracting bad data into data/')
    output_dir = 'data/bad/'
    ensure_path(output_dir)
    result = {}
    data = get_data(osm_data)[1]
    osm_to_p31_data = load_dict(osm_to_p31)
    p31lookup = osm_to_p31_data[1]
    p31info = load_dict(p31_info)[1]
    for row in data:
        if row[1] not in p31lookup:
            continue
        p31 = p31lookup[row[1]]
        if len(p31) < 1 or p31[0] not in p31info:
            continue
        p31inf = p31info[p31[0]]
        bad = p31inf[0]
        if not bad or bad == 'ok':
            continue
        if purge_bad:
            result[row[1]] = True
        else:
            if bad not in result:
                result[bad] = []
            row.extend(p31)
            row.extend(p31inf)
            result[bad].append(row)
    if purge_bad:
        should_save = False
        for key in result.keys():
            try:
                del p31lookup[key]
                should_save = True
            except KeyError:
                pass
        if should_save:
            save_dict(osm_to_p31, osm_to_p31_data[0], p31lookup)
    else:
        for bad, lst in result.items():
            outfile = output_dir + bad
            print('Saving bad file {}'.format(outfile))
            save_data(outfile + '.tsv', 'osmid\twikidata\twikipedia\tp31\tbad\tname\tdescription\tclass\tclassname'.split('\t'), lst)
            save_dataset(outfile + '.tab', lst, 3)
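
# With purge_bad=True the flagged wikidata IDs are removed from the osm-to-P31 dictionary,
# presumably so they are re-fetched fresh on the next pass; otherwise the offending OSM rows
# are written to data/bad/<keyword>.tsv and .tab.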

osm_data = 'data/osm_objects.tsv'
osm_to_p31 = 'data/wikidata_with_p31.tsv'
p31_info = 'data/p31_info.tsv'

# Download all relations, ways, and nodes that contain a "wikidata" tag into a file
# Copy all wikidata tags into a separate file if they are not there already
# Get the P31 (instance-of) property for all new wikidata entries
# Copy the P31 values into a separate file if they are not there already
# Add the label, description, and P279 (subclass-of) label for the P31 values
# -- the file data/p31_info.tsv is manually edited to mark all "bad" P31s with a keyword in the 2nd column
# Based on the p31_info, copy all "bad" OSM objects into separate files, one file per keyword
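# The calls below run this pipeline twice: the first pass ends by purging flagged bad IDs from the lookup file,
# and the second pass appears to re-fetch them before writing the per-keyword reports under data/bad/.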
get_osm_tags(osm_data)
column_to_dict(osm_data, osm_to_p31, 1, is_valid_wd)
populate_p31(osm_to_p31)
column_to_dict(osm_to_p31, p31_info, 1, is_valid_wd)
populate_p31info(p31_info)
extract_bad_data(osm_data, osm_to_p31, p31_info, True)

column_to_dict(osm_data, osm_to_p31, 1, is_valid_wd)
populate_p31(osm_to_p31)
column_to_dict(osm_to_p31, p31_info, 1, is_valid_wd)
populate_p31info(p31_info)
extract_bad_data(osm_data, osm_to_p31, p31_info)