danstowell · March 22, 2017 22:27 · nicolasmaia · Mar 23, 2017
diff --git a/wikidata_scan_osm.py b/wikidata_scan_osm.py
 import os, sys, re
 from datetime import datetime
 from imposm.parser import OSMParser

 ########################################################

 # Label of the OSM extract to scan. It is substituted into the file name
 # below; the commented-out lines are alternative labels.
 #osmsourcelbl = 'greater-london'
 #osmsourcelbl = 'great-britain'
 osmsourcelbl = 'planet'

 # Path of the .osm.pbf input file: ~/osm/<label>-latest.osm.pbf with the
 # leading "~" expanded by os.path.expanduser.
 osmsourcefpath = os.path.expanduser('~/osm/%s-latest.osm.pbf' % osmsourcelbl)

 ########################################################

 # Accepts a whole string made of one 'q' or 'Q' followed by one or more digits.
 wkddatum_format_matcher = re.compile(r"^[qQ]\d+$")

 class WkdRefScan(object):
 	"""Collect a map from Wikidata identifiers to the OSM objects tagged with them.

 	The ``ways``/``nodes``/``relations`` methods are passed to ``OSMParser``
 	as callbacks in the script section of this file; each call feeds one
 	batch of items into ``osmobjs``.

 	Attributes:
 		wikidatas: number of items seen that carry a 'wikidata' tag.
 		granddict: dict mapping a normalised identifier (u'Q' + digits) to a
 			list of (osmtype, osmid) tuples.
 	"""

 	def __init__(self):
 		# State lives on the instance. A dict defined in the class body would
 		# be one object shared (and mutated) by every WkdRefScan instance.
 		self.wikidatas = 0
 		self.granddict = {}

 	def osmobjs(self, osmtype, items):
 		"""Record the wikidata references found in one batch of OSM items.

 		Args:
 			osmtype: label stored with each object and used in the warning
 				URL ('way', 'node' or 'relation' from the wrappers below).
 			items: iterable of 3-tuples ``(osmid, tags, other)``; only
 				``osmid`` and ``tags`` are used.
 		"""
 		granddict = self.granddict
 		for osmid, tags, otherstuff in items:
 			if 'wikidata' not in tags:
 				continue
 			self.wikidatas += 1
 			# A set, so an identifier repeated within one tag value is
 			# recorded only once for this object.
 			wkdrefs = set()
 			for wkddatum in tags['wikidata'].split(';'):
 				wkddatum = wkddatum.strip()
 				# if garbage (meaning not expected format [qQ]\d+), skip
 				if wkddatum_format_matcher.match(wkddatum) is None:
 					print("Malformed wikidata string: %s -- in http://www.openstreetmap.org/%s/%i" % (wkddatum, osmtype, osmid))
 					continue
 				wkdrefs.add(u'Q' + wkddatum[1:])  # normalise q to Q
 			for wkddatum in wkdrefs:
 				granddict.setdefault(wkddatum, []).append((osmtype, osmid))

 	def ways(self, items):
 		"Callback for batches of ways."
 		self.osmobjs('way', items)

 	def nodes(self, items):
 		"Callback for batches of nodes."
 		self.osmobjs('node', items)

 	def relations(self, items):
 		"Callback for batches of relations."
 		self.osmobjs('relation', items)

 	def write(self, pathstem):
 		"""Write the whole wkd->osm lookup to ``<pathstem>.csv`` and ``<pathstem>.json``.

 		CSV: one line per identifier, ``Qnnn,type/id,type/id,...``.
 		JSON: one object, one entry per line, ``"Qnnn": [["type",id], ...]``.
 		Identifiers and the objects under each are written in sorted order.

 		Args:
 			pathstem: output path without extension; its directory must exist.
 		"""
 		# The files are opened in binary mode, so each line is built as text
 		# and encoded once when written. Writing text straight to a 'wb'
 		# file raises TypeError on Python 3.
 		with open("%s.csv" % pathstem, 'wb') as csvfp, open("%s.json" % pathstem, 'wb') as jsonfp:
 			jsonfp.write(b"{\n")
 			separator = u''  # every entry after the first starts with a comma
 			for wkd, osmobjlist in sorted(self.granddict.items()):
 				osmobjs = sorted(osmobjlist)
 				csvline = u"%s%s\n" % (wkd, u"".join(u",%s/%i" % obj for obj in osmobjs))
 				jsonline = u'%s"%s": [%s]\n' % (
 					separator, wkd, u", ".join(u'["%s",%i]' % obj for obj in osmobjs))
 				csvfp.write(csvline.encode('utf-8'))
 				jsonfp.write(jsonline.encode('utf-8'))
 				separator = u','
 			jsonfp.write(b"}\n")

 ########################################################
 if __name__ == '__main__':
 	# Script entry point: parse the OSM file named by osmsourcefpath, feed
 	# every way/node/relation batch to a WkdRefScan, then write the lookup.
 	# NOTE(review): the stem says "gb" while osmsourcelbl is 'planet' --
 	# confirm the intended output name.
 	outstem = "output/wkdosm-gb"
 	# Create the output directory before parsing: otherwise a missing
 	# "output/" only shows up when write() opens its files, after the whole
 	# parse has already run.
 	outdir = os.path.dirname(outstem)
 	if outdir and not os.path.isdir(outdir):
 		os.makedirs(outdir)
 	refobj = WkdRefScan()
 	p = OSMParser(concurrency=4, ways_callback=refobj.ways, nodes_callback=refobj.nodes, relations_callback=refobj.relations)
 	starttime = datetime.now()
 	print("Beginning parsing. Time: %s" % starttime)
 	p.parse(osmsourcefpath)
 	endtime = datetime.now()
 	print("Time taken: %s" % (endtime - starttime,))
 	print("Number of wikidata tags encountered: %i" % refobj.wikidatas)
 	refobj.write(outstem)
	import os, sys, re
	from datetime import datetime
	from imposm.parser import OSMParser

	########################################################

	# Label of the OSM extract to scan. It is substituted into the file name
	# below; the commented-out lines are alternative labels.
	#osmsourcelbl = 'greater-london'
	#osmsourcelbl = 'great-britain'
	osmsourcelbl = 'planet'

	# Path of the .osm.pbf input file: ~/osm/<label>-latest.osm.pbf with the
	# leading "~" expanded by os.path.expanduser.
	osmsourcefpath = os.path.expanduser('~/osm/%s-latest.osm.pbf' % osmsourcelbl)

	########################################################

	# Accepts a whole string made of one 'q' or 'Q' followed by one or more digits.
	wkddatum_format_matcher = re.compile(r"^[qQ]\d+$")

	class WkdRefScan(object):
		"""Collect a map from Wikidata identifiers to the OSM objects tagged with them.

		The ``ways``/``nodes``/``relations`` methods are passed to ``OSMParser``
		as callbacks in the script section of this file; each call feeds one
		batch of items into ``osmobjs``.

		Attributes:
			wikidatas: number of items seen that carry a 'wikidata' tag.
			granddict: dict mapping a normalised identifier (u'Q' + digits) to a
				list of (osmtype, osmid) tuples.
		"""

		def __init__(self):
			# State lives on the instance. A dict defined in the class body would
			# be one object shared (and mutated) by every WkdRefScan instance.
			self.wikidatas = 0
			self.granddict = {}

		def osmobjs(self, osmtype, items):
			"""Record the wikidata references found in one batch of OSM items.

			Args:
				osmtype: label stored with each object and used in the warning
					URL ('way', 'node' or 'relation' from the wrappers below).
				items: iterable of 3-tuples ``(osmid, tags, other)``; only
					``osmid`` and ``tags`` are used.
			"""
			granddict = self.granddict
			for osmid, tags, otherstuff in items:
				if 'wikidata' not in tags:
					continue
				self.wikidatas += 1
				# A set, so an identifier repeated within one tag value is
				# recorded only once for this object.
				wkdrefs = set()
				for wkddatum in tags['wikidata'].split(';'):
					wkddatum = wkddatum.strip()
					# if garbage (meaning not expected format [qQ]\d+), skip
					if wkddatum_format_matcher.match(wkddatum) is None:
						print("Malformed wikidata string: %s -- in http://www.openstreetmap.org/%s/%i" % (wkddatum, osmtype, osmid))
						continue
					wkdrefs.add(u'Q' + wkddatum[1:])  # normalise q to Q
				for wkddatum in wkdrefs:
					granddict.setdefault(wkddatum, []).append((osmtype, osmid))

		def ways(self, items):
			"Callback for batches of ways."
			self.osmobjs('way', items)

		def nodes(self, items):
			"Callback for batches of nodes."
			self.osmobjs('node', items)

		def relations(self, items):
			"Callback for batches of relations."
			self.osmobjs('relation', items)

		def write(self, pathstem):
			"""Write the whole wkd->osm lookup to ``<pathstem>.csv`` and ``<pathstem>.json``.

			CSV: one line per identifier, ``Qnnn,type/id,type/id,...``.
			JSON: one object, one entry per line, ``"Qnnn": [["type",id], ...]``.
			Identifiers and the objects under each are written in sorted order.

			Args:
				pathstem: output path without extension; its directory must exist.
			"""
			# The files are opened in binary mode, so each line is built as text
			# and encoded once when written. Writing text straight to a 'wb'
			# file raises TypeError on Python 3.
			with open("%s.csv" % pathstem, 'wb') as csvfp, open("%s.json" % pathstem, 'wb') as jsonfp:
				jsonfp.write(b"{\n")
				separator = u''  # every entry after the first starts with a comma
				for wkd, osmobjlist in sorted(self.granddict.items()):
					osmobjs = sorted(osmobjlist)
					csvline = u"%s%s\n" % (wkd, u"".join(u",%s/%i" % obj for obj in osmobjs))
					jsonline = u'%s"%s": [%s]\n' % (
						separator, wkd, u", ".join(u'["%s",%i]' % obj for obj in osmobjs))
					csvfp.write(csvline.encode('utf-8'))
					jsonfp.write(jsonline.encode('utf-8'))
					separator = u','
				jsonfp.write(b"}\n")

	########################################################
	if __name__ == '__main__':
		# Script entry point: parse the OSM file named by osmsourcefpath, feed
		# every way/node/relation batch to a WkdRefScan, then write the lookup.
		# NOTE(review): the stem says "gb" while osmsourcelbl is 'planet' --
		# confirm the intended output name.
		outstem = "output/wkdosm-gb"
		# Create the output directory before parsing: otherwise a missing
		# "output/" only shows up when write() opens its files, after the whole
		# parse has already run.
		outdir = os.path.dirname(outstem)
		if outdir and not os.path.isdir(outdir):
			os.makedirs(outdir)
		refobj = WkdRefScan()
		p = OSMParser(concurrency=4, ways_callback=refobj.ways, nodes_callback=refobj.nodes, relations_callback=refobj.relations)
		starttime = datetime.now()
		print("Beginning parsing. Time: %s" % starttime)
		p.parse(osmsourcefpath)
		endtime = datetime.now()
		print("Time taken: %s" % (endtime - starttime,))
		print("Number of wikidata tags encountered: %i" % refobj.wikidatas)
		refobj.write(outstem)