#!/usr/local/bin/python3
# -*- coding: utf-8 -*-
from pywikibot.data.sparql import SparqlQuery
import re
import overpass
import time
import os
import errno
# import csv
# from io import StringIO
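# Wikidata item IDs are "Q" followed by up to nine digits, no leading zero.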
qnumber = re.compile('^Q[1-9][0-9]{0,8}$')
def ensure_path(path):
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise
def get_data(file):
    # Read a TSV file; return (header fields, list of data rows).
    with open(file, 'r') as data_file:
        lines = data_file.read().split('\n')
        headers = lines[0].split('\t')
        return headers, [line.split('\t') for line in lines[1:] if len(line) > 0]
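# Data files are plain TSV: a header row such as '@id\twikidata\twikipedia',
# then one record per line, e.g. 'r62422\tQ64\tde:Berlin' (a hypothetical row).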
def load_dict(file):
    # Like get_data, but key the rows by their first column.
    headers, data = get_data(file)
    return headers, {row[0]: row[1:] for row in data}
def save_data(file, headers, data):
    with open(file, 'w') as data_file:
        text = '\t'.join(headers) + '\n' + '\n'.join(['\t'.join(v) for v in data]) + '\n'
        data_file.write(text)
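# save_dataset writes each row as a JSON-style array on its own line ('null'
# for empty cells), sorted case-insensitively by the last kept column.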
def save_dataset(file, data, columns):
    with open(file, 'w') as data_file:
        data.sort(key=lambda row: row[columns - 1].lower())
        text = '[' + '],\n['.join([','.join([('null' if v == '' else '"' + v.replace('"', '\\"') + '"') for v in row[:columns]]) for row in data]) + ']\n'
        data_file.write(text)
def save_dict(file, headers, data):
    save_data(file, headers, [[k] + v for k, v in sorted(data.items())])
def batches(data, size):
    # Yield successive chunks of at most `size` items.
    data = list(data)
    for i in range(0, len(data), size):
        yield data[i:i + size]
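# e.g. list(batches(range(5), 2)) == [[0, 1], [2, 3], [4]]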
def is_valid_wd(key):
    return qnumber.match(key)
def fill_from_wikidata(filename, query, func, refreshMissing=False):
    # Query WDQS in batches for every key that has no value yet (or only the
    # '-' placeholder when refreshMissing is set), storing func(row) results.
    sparql = SparqlQuery()
    headers, data = load_dict(filename)
    for k in data.keys():
        if not is_valid_wd(k):
            print('Bad Wikidata ID: {}'.format(k))
    missing = [k for k, v in data.items() if (not v or (refreshMissing and v == ['-'])) and qnumber.match(k)]
    if len(missing) > 0:
        print('Processing {} items'.format(len(missing)))
        count = 0
        for batch in batches(missing, 240):
            ids = ' '.join(['wd:' + s for s in batch])
            qry = query.replace('%IDS%', ids)
            batchSet = set(batch)
            for row in sparql.select(qry, True):
                k, v = func(row)
                data[k] = v
                count += 1
                batchSet.discard(k)
            # Mark IDs that returned no results with '-' so they are not re-queried.
            for k in batchSet:
                data[k] = ['-']
            # Checkpoint after every batch so an interrupted run loses little work.
            save_dict(filename, headers, data)
        print('Processed {} items'.format(count))
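# Overpass CSV output may wrap field values in double quotes; this regex is
# used below to strip those quotes.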
rmQuotes = re.compile(r'\t"([^\n\t]*)"(\t|\n)')
def download_osm(api, filename, ot_query, result_prefix, sleep=0):
    print('downloading OSM data for \'{}\' into {}'.format(ot_query, filename))
    response = api.Get(ot_query, responseformat='csv(::id,wikidata,wikipedia)', verbosity='meta')
    # csvreader = csv.reader(StringIO(response), delimiter='\t', quotechar='"')
    # for row in csvreader:
    response = rmQuotes.sub(r'\t\1\2', response)
    count = max(response.count('\n') - 1, 0)
    # Prefix every data line with the element type (r/w/n) and drop the header row.
    response = response.strip('\n').replace('\n', '\n' + result_prefix)
    response = response.replace('@id\twikidata\twikipedia', '', 1).strip()
    if len(response) > 0:
        response += '\n'
    with open(filename, 'a') as data_file:
        data_file.write(response)
    print('received {} values'.format(count))
    time.sleep(sleep)
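# The downloads below are split by element type, by the leading digit of the
# Q-number, and by bounding box; presumably this keeps each Overpass response
# small enough to finish within the timeout, while the sleep values space the
# requests out to stay within Overpass usage limits.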
def get_osm_tags(filename):
    api = overpass.API(timeout=300)
    with open(filename, 'w') as data_file:
        data_file.write('@id\twikidata\twikipedia\n')
    download_osm(api, filename, 'rel["wikidata"]', 'r')
    download_osm(api, filename, 'way["wikidata"~"^Q1"]', 'w', 230)
    download_osm(api, filename, 'way["wikidata"~"^Q2"]', 'w', 230)
    download_osm(api, filename, 'way["wikidata"~"^Q3"]', 'w', 230)
    download_osm(api, filename, 'way["wikidata"~"^Q[456789]"]', 'w', 0)
    download_osm(api, filename, 'node["wikidata"](-89.9999999,-180,0,0)', 'n', 230)
    download_osm(api, filename, 'node["wikidata"](-89.9999999,0,0,180)', 'n', 230)
    download_osm(api, filename, 'node["wikidata"](0,-180,89.9999999,0)', 'n', 230)
    download_osm(api, filename, 'node["wikidata"](0,0,89.9999999,4.9999999)', 'n', 230)
    download_osm(api, filename, 'node["wikidata"](0,5,89.9999999,9.9999999)', 'n', 300)
    download_osm(api, filename, 'node["wikidata"](0,10,89.9999999,14.9999999)', 'n', 300)
    download_osm(api, filename, 'node["wikidata"](0,15,89.9999999,18.9999999)', 'n', 300)
    download_osm(api, filename, 'node["wikidata"](0,19,89.9999999,29.9999999)', 'n', 300)
    download_osm(api, filename, 'node["wikidata"](0,30.000001,89.9999999,60)', 'n', 300)
    download_osm(api, filename, 'node["wikidata"](0,60.000001,89.9999999,90)', 'n', 300)
    download_osm(api, filename, 'node["wikidata"](0,90.000001,89.9999999,180)', 'n', 0)
def column_to_dict(data_file, dictionary_file, index, validator=None):
    # Copy new values from the given column into the dictionary file, with
    # empty values to be filled in later.
    data = get_data(data_file)[1]
    headers, dct = load_dict(dictionary_file)
    for row in data:
        if len(row) <= index:
            continue
        val = row[index]
        if val not in dct and (not validator or validator(val)):
            dct[val] = []
    save_dict(dictionary_file, headers, dct)
def populate_p31(filename):
    print('Populating wikidata items with their instance-of (P31) for {}'.format(filename))
    # The owl:sameAs branch resolves redirected items to their target's P31.
    query = '''
    SELECT ?id ?instanceOf
    WHERE {
      VALUES ?id { %IDS% }
      { ?id wdt:P31 ?instanceOf }
      UNION
      { ?id owl:sameAs/wdt:P31 ?instanceOf }
    }'''
    fill_from_wikidata(filename, query, lambda row: (row['id'].getID(), [row['instanceOf'].getID()]))
def populate_p31info(filename):
    print('Populating instance-of data with their name, description, and class for {}'.format(filename))
    # BIND (EXISTS {?id wdt:P279 wd:Q4167410} as ?isDisambig)
    query = '''
    SELECT ?id ?idLabel ?idDescription ?parentClass ?parentClassLabel
    WHERE {
      VALUES ?id { %IDS% }
      OPTIONAL { ?id wdt:P279 ?parentClass . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en,fr,es,de,it,ru,uk,bg,ro,be,bn,ca,cs,da,el,et,fa,fi,he,hi,hu,hy,id,ja,jv,ko,nb,nl,eo,pa,pl,pt,sh,sk,sr,sv,sw,te,th,tr,yue,vec,vi,zh,ar" . }
    }'''
    fill_from_wikidata(filename, query, lambda row: (row['id'].getID(), [
        '',  # "IsBad" marker, left blank here and filled in manually later
        row['idLabel'].value if row['idLabel'] else '',
        row['idDescription'].value if row['idDescription'] else '',
        row['parentClass'].getID() if row['parentClass'] else '',
        row['parentClassLabel'].value if row['parentClassLabel'] else ''
    ]))
def extract_bad_data(osm_data, osm_to_p31, p31_info, purge_bad=False):
    output_dir = 'data/bad/'
    if purge_bad:
        print('Combining data and removing bad data from ' + osm_to_p31)
    else:
        print('Combining data and extracting bad data into ' + output_dir)
    ensure_path(output_dir)
    result = {}
    data = get_data(osm_data)[1]
    osm_to_p31_data = load_dict(osm_to_p31)
    p31lookup = osm_to_p31_data[1]
    p31info = load_dict(p31_info)[1]
    for row in data:
        if row[1] not in p31lookup:
            continue
        p31 = p31lookup[row[1]]
        if len(p31) < 1 or p31[0] not in p31info:
            continue
        p31inf = p31info[p31[0]]
        bad = p31inf[0]  # the manually-edited "bad" keyword column
        if not bad or bad == 'ok':
            continue
        if purge_bad:
            result[row[1]] = True
        else:
            if bad not in result:
                result[bad] = []
            row.extend(p31)
            row.extend(p31inf)
            result[bad].append(row)
    if purge_bad:
        # Drop the bad Wikidata IDs from the lookup file so they get re-fetched.
        should_save = False
        for key in result.keys():
            try:
                del p31lookup[key]
                should_save = True
            except KeyError:
                pass
        if should_save:
            save_dict(osm_to_p31, osm_to_p31_data[0], p31lookup)
    else:
        for bad, lst in result.items():
            outfile = output_dir + bad
            print('Saving bad file {}'.format(outfile))
            save_data(outfile + '.tsv', 'osmid\twikidata\twikipedia\tp31\tbad\tname\tdescription\tclass\tclassname'.split('\t'), lst)
            save_dataset(outfile + '.tab', lst, 3)
osm_data = 'data/osm_objects.tsv'
osm_to_p31 = 'data/wikidata_with_p31.tsv'
p31_info = 'data/p31_info.tsv'
# Download all relations, ways, and nodes that contain a "wikidata" tag into a file
# Copy all wikidata tags into a separate file if they are not there already
# Get the P31 (instance-of) property for all new wikidata entries
# Copy P31 values into a separate file if they are not there already
# Add label, description, and P279 (subclass-of) label for the P31 values
# -- the file data/p31_info.tsv is manually edited to mark all "bad" P31s with a keyword in the 2nd column
# Based on the p31_info, copy all "bad" osm objects into separate files, one file per keyword
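# First pass: build the datasets, then purge Wikidata IDs whose P31 is marked
# bad from the lookup file so they will be re-queried from scratch.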
get_osm_tags(osm_data)
column_to_dict(osm_data, osm_to_p31, 1, is_valid_wd)
populate_p31(osm_to_p31)
column_to_dict(osm_to_p31, p31_info, 1, is_valid_wd)
populate_p31info(p31_info)
extract_bad_data(osm_data, osm_to_p31, p31_info, True)
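# Second pass: re-add and re-query the purged IDs, then write the per-keyword
# bad-data reports into data/bad/.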
column_to_dict(osm_data, osm_to_p31, 1, is_valid_wd)
populate_p31(osm_to_p31)
column_to_dict(osm_to_p31, p31_info, 1, is_valid_wd)
populate_p31info(p31_info)
extract_bad_data(osm_data, osm_to_p31, p31_info)