Created
March 22, 2017 22:27
-
-
Save danstowell/f2e05428ec62aafa77f4b24693a1a4d5 to your computer and use it in GitHub Desktop.
Script to build Wikidata -> OpenStreetMap lookup table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, sys, re | |
from datetime import datetime | |
from imposm.parser import OSMParser | |
########################################################
# Configuration: which OpenStreetMap extract to scan.
# The label is substituted into the file name "~/osm/<label>-latest.osm.pbf".
# Alternative labels that have been used with this script:
#osmsourcelbl = 'greater-london'
#osmsourcelbl = 'great-britain'
osmsourcelbl = 'planet'
osmsourcefpath = os.path.expanduser('~/osm/%s-latest.osm.pbf' % osmsourcelbl)

########################################################
# Accepts a single Wikidata item identifier: the letter q or Q followed by one
# or more ASCII digits.  "[0-9]" is used rather than "\d" because on Python 3
# "\d" in a str pattern also matches non-ASCII Unicode digits, which would let
# malformed tag values through the filter.
wkddatum_format_matcher = re.compile(r"^[qQ][0-9]+$")
class WkdRefScan(object):
    """Collect a map from Wikidata identifiers to the OSM objects tagged with them.

    The ``ways``, ``nodes`` and ``relations`` methods are intended to be used
    as parser callbacks: each receives an iterable of ``(osmid, tags, other)``
    tuples and records every object whose tags contain a ``wikidata`` key.

    Attributes:
        wikidatas: number of objects seen that carried a ``wikidata`` tag
            (counted even if every value in the tag turned out malformed).
        granddict: maps a normalised identifier such as ``"Q42"`` to a list
            of ``(osmtype, osmid)`` tuples.
    """

    def __init__(self):
        # State is per instance; keeping the dict on the class would make all
        # instances silently share (and pollute) the same lookup table.
        self.wikidatas = 0
        self.granddict = {}

    def osmobjs(self, osmtype, items):
        """Record the wikidata references found on a batch of OSM objects.

        Args:
            osmtype: label stored with each object and used in the diagnostic
                URL ('way', 'node' or 'relation' for the wrappers below).
            items: iterable of ``(osmid, tags, other)`` tuples; ``tags`` must
                support ``in`` and item lookup, ``other`` is ignored.
        """
        granddict = self.granddict
        for osmid, tags, _otherstuff in items:
            if 'wikidata' not in tags:
                continue
            self.wikidatas += 1
            # A tag may hold several ';'-separated identifiers; a set makes
            # sure a repeated identifier lists this object only once.
            wkdrefs = set()
            for wkddatum in tags['wikidata'].split(';'):
                wkddatum = wkddatum.strip()
                # if garbage (meaning not the expected identifier format), skip
                if wkddatum_format_matcher.match(wkddatum) is None:
                    print("Malformed wikidata string: %s -- in http://www.openstreetmap.org/%s/%i" % (wkddatum, osmtype, osmid))
                    continue
                wkdrefs.add(u'Q' + wkddatum[1:])  # normalise q to Q
            for wkddatum in wkdrefs:
                granddict.setdefault(wkddatum, []).append((osmtype, osmid))

    def ways(self, items):
        """Callback for a batch of ways."""
        self.osmobjs('way', items)

    def nodes(self, items):
        """Callback for a batch of nodes."""
        self.osmobjs('node', items)

    def relations(self, items):
        """Callback for a batch of relations."""
        self.osmobjs('relation', items)

    def write(self, pathstem):
        """Write out JSON and CSV files containing the whole wkd->osm lookup data.

        Creates ``<pathstem>.csv`` and ``<pathstem>.json``.  Identifiers are
        written in sorted (string) order, and so are the objects of each one.

        CSV:  one line per identifier, ``Q42,node/1,way/2``.
        JSON: one object, one key per line, ``"Q42": [["node",1], ["way",2]]``.

        Args:
            pathstem: output path without extension; its directory must exist.
        """
        # The files are opened in binary mode and every piece of text is
        # encoded explicitly: writing str to a binary file raises TypeError on
        # Python 3, and binary mode keeps "\n" line endings on all platforms.
        with open("%s.csv" % pathstem, 'wb') as csvfp:
            with open("%s.json" % pathstem, 'wb') as jsonfp:
                jsonfp.write(b"{\n")
                firsteverentry = True
                for wkd, osmobjlist in sorted(self.granddict.items()):
                    osmobjs = sorted(osmobjlist)

                    csvline = "%s%s\n" % (
                        wkd,
                        "".join(",%s/%i" % (osmtype, osmid) for osmtype, osmid in osmobjs),
                    )
                    csvfp.write(csvline.encode('utf-8'))

                    # Every entry but the first is preceded by the comma that
                    # separates it from the previous key.
                    jsonline = '%s"%s": [%s]\n' % (
                        '' if firsteverentry else ',',
                        wkd,
                        ", ".join('["%s",%i]' % (osmtype, osmid) for osmtype, osmid in osmobjs),
                    )
                    jsonfp.write(jsonline.encode('utf-8'))
                    firsteverentry = False
                jsonfp.write(b"}\n")
######################################################## | |
if __name__ == '__main__':
    # Path stem for the results; WkdRefScan.write() appends ".csv" and ".json".
    # NOTE(review): the stem says "gb" regardless of osmsourcelbl -- confirm
    # whether the output name is meant to follow the selected extract.
    outputstem = "output/wkdosm-gb"

    # Make sure the output directory exists *before* parsing, so that a
    # missing directory cannot make the final write fail only after the whole
    # input file has been processed.
    outputdir = os.path.dirname(outputstem)
    if outputdir and not os.path.isdir(outputdir):
        os.makedirs(outputdir)

    refobj = WkdRefScan()
    p = OSMParser(concurrency=4, ways_callback=refobj.ways, nodes_callback=refobj.nodes, relations_callback=refobj.relations)

    startTime = datetime.now()
    print("Beginning parsing. Time: %s" % startTime)
    p.parse(osmsourcefpath)
    endtime = datetime.now()
    print("Time taken: %s" % (endtime - startTime,))
    print("Number of wikidata tags encountered: %i" % refobj.wikidatas)

    refobj.write(outputstem)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Please add a license header :^)