Skip to content

Instantly share code, notes, and snippets.

@RavuAlHemio
Created September 4, 2013 20:57
Show Gist options
  • Save RavuAlHemio/6442776 to your computer and use it in GitHub Desktop.
Save RavuAlHemio/6442776 to your computer and use it in GitHub Desktop.
OSM-XML duplicate address detector
#!/usr/bin/env python3
#
# Detects duplicate address specifications (addr:street, addr:housenumber
# and possibly addr:unit) in an OSM XML file.
#
# Released into the public domain.
from xml.dom.minidom import parseString as domParse
import xml.dom as dom
def _fishOutAddressTags(nd):
    """Collect the ``addr:*`` tags of a single DOM element.

    Scans the direct children of ``nd`` for ``<tag>`` elements and returns a
    dict mapping every ``k`` attribute that starts with ``addr:`` to the
    corresponding ``v`` attribute.  Children that are not elements (text,
    comments) and elements with any other tag name are ignored.  If the same
    key occurs more than once, the last occurrence wins.
    """
    ret = {}
    for subnd in nd.childNodes:
        # Only element nodes have a tagName; text/comment nodes are skipped
        # before tagName is looked at.
        if subnd.nodeType != subnd.ELEMENT_NODE or subnd.tagName != "tag":
            continue
        k = subnd.getAttribute("k")
        if k.startswith("addr:"):
            ret[k] = subnd.getAttribute("v")
    return ret
def fishOutAddressData(xmlstring):
    """Extract the address tags of every element in an OSM XML document.

    ``xmlstring`` is parsed with ``domParse``.  Each ``<node>``, ``<way>``
    and ``<relation>`` that is a direct child of the document element is
    inspected; other children are ignored, as are elements whose ``action``
    attribute equals ``"delete"``.

    Returns a dict keyed by ``"n"``/``"w"``/``"r"`` followed by the element's
    ``id`` attribute; each value is the dict of ``addr:*`` tags produced by
    ``_fishOutAddressTags``.  Elements without any ``addr:*`` tag are left
    out of the result.

    Raises ValueError if two address-bearing elements yield the same key.
    """
    # One-letter key prefix per supported element type.
    prefixes = {"node": "n", "way": "w", "relation": "r"}

    ret = {}
    d = domParse(xmlstring)
    for nd in d.documentElement.childNodes:
        if nd.nodeType != nd.ELEMENT_NODE:
            continue
        prefix = prefixes.get(nd.tagName)
        if prefix is None:
            continue
        if nd.getAttribute("action") == "delete":
            # ne'er mind
            continue
        myid = prefix + nd.getAttribute("id")
        myaddrs = _fishOutAddressTags(nd)
        if myaddrs:
            if myid in ret:
                raise ValueError("duplicate ID " + myid)
            ret[myid] = myaddrs
    return ret
def pairwise(iterable, f):
    """Call ``f(a, b)`` once for every unordered pair of items.

    The items of ``iterable`` are consumed up front.  Pairs are visited in
    positional order: the first item with each later item, then the second
    item with each later item, and so on.  An item is never paired with
    itself, and fewer than two items result in no calls.  Return values of
    ``f`` are discarded; exceptions raised by ``f`` propagate to the caller.
    """
    from itertools import combinations

    # combinations(..., 2) yields the same pairs in the same order as a
    # nested "for na / for nb in range(na + 1, ...)" index loop.
    for a, b in combinations(iterable, 2):
        f(a, b)
def prettyAddr(addr):
    """Format an address dict as ``"<street> <housenumber>[/<unit>]"``.

    ``addr`` must contain the keys ``addr:street`` and ``addr:housenumber``
    (a missing one raises KeyError).  When ``addr:unit`` is present it is
    appended after a slash.
    """
    ret = "{0} {1}".format(addr["addr:street"], addr["addr:housenumber"])
    if "addr:unit" in addr:
        ret += "/" + addr["addr:unit"]
    return ret
def validateAddr(eid, addr):
    """Ensure an address dict has both a street and a house number.

    ``eid`` is the element identifier used in the error message; ``addr`` is
    the dict of ``addr:*`` tags.  Returns None when both required keys are
    present.

    Raises ValueError naming ``eid`` when ``addr:street`` is missing, or,
    failing that, when ``addr:housenumber`` is missing.
    """
    if "addr:street" not in addr:
        raise ValueError(eid + " has no street")
    if "addr:housenumber" not in addr:
        raise ValueError(eid + " has no house number")
def isAddrEq(a, b):
    """Tell whether two ``(id, address-dict)`` pairs denote the same address.

    Both addresses are first checked with ``validateAddr``.  They are
    considered different as soon as the street or the house number differs.
    When those match:

    * neither has ``addr:unit``  -> equal (returns True);
    * both have ``addr:unit``    -> equal iff the units are equal;
    * exactly one has a unit     -> ValueError, because the pair is
      ambiguous ("mostly equal").

    Raises ValueError from ``validateAddr`` for incomplete addresses, or for
    the one-sided-unit case described above.
    """
    (aid, aaddr) = a
    (bid, baddr) = b
    validateAddr(*a)
    validateAddr(*b)

    for key in ("addr:street", "addr:housenumber"):
        if aaddr[key] != baddr[key]:
            return False

    a_has_unit = "addr:unit" in aaddr
    b_has_unit = "addr:unit" in baddr

    if not a_has_unit and not b_has_unit:
        return True
    if not a_has_unit:
        # Only b carries a unit.
        raise ValueError("{0} mostly equals to {1} but {0} has no unit while {1} has".format(aid, bid))
    if not b_has_unit:
        # Only a carries a unit.
        raise ValueError("{0} mostly equals to {1} but {0} has a unit while {1} has not".format(aid, bid))
    return aaddr["addr:unit"] == baddr["addr:unit"]
def raiseIfAddrEq(a, b):
    """Raise ValueError if two ``(id, address-dict)`` pairs share an address.

    Equality is decided by ``isAddrEq`` (whose own ValueErrors propagate
    unchanged).  On a match, the error message names both element ids and
    shows both addresses as formatted by ``prettyAddr``.  Returns None when
    the addresses differ.
    """
    if isAddrEq(a, b):
        raise ValueError("{0}'s address ({2}) is equal to {1}'s address ({3})".format(a[0], b[0], prettyAddr(a[1]), prettyAddr(b[1])))
# Script entry point: check the OSM XML file named on the command line and
# abort with a ValueError at the first duplicate (or ambiguous) address.
if __name__ == '__main__':
    from sys import argv

    if len(argv) < 2:
        raise SystemExit("usage: {0} OSMFILE".format(argv[0]))

    # Read raw bytes and let the XML parser pick the encoding from the
    # document itself; text mode would decode with the locale's encoding,
    # which garbles non-ASCII street names on non-UTF-8 systems.
    with open(argv[1], "rb") as f:
        ad = fishOutAddressData(f.read())
    pairwise(ad.items(), raiseIfAddrEq)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment