Created
September 4, 2013 20:57
-
-
Save RavuAlHemio/6442776 to your computer and use it in GitHub Desktop.
OSM-XML duplicate address detector
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
#
# Detects duplicate address specifications (addr:street, addr:housenumber
# and possibly addr:unit) in an OSM XML file.
#
# Released into the public domain.
import xml.dom as dom
from itertools import combinations
from xml.dom.minidom import parseString as domParse
def _fishOutAddressTags(nd):
    """Collect the ``addr:*`` tags found directly below a DOM element.

    Only direct child elements named ``tag`` are inspected; text nodes and
    other child elements are ignored.

    Returns a dict mapping each ``k`` attribute that starts with ``addr:``
    to the corresponding ``v`` attribute.
    """
    tags = {}
    for child in nd.childNodes:
        # Non-element children (text, comments, ...) have no tagName.
        if child.nodeType == child.ELEMENT_NODE and child.tagName == "tag":
            key = child.getAttribute("k")
            if key.startswith("addr:"):
                tags[key] = child.getAttribute("v")
    return tags
def fishOutAddressData(xmlstring):
    """Extract the address tags of every element in an OSM XML document.

    The direct children of the document element are scanned. ``node``,
    ``way`` and ``relation`` elements are keyed by ``n``, ``w`` or ``r``
    followed by their ``id`` attribute; all other children are skipped, as
    are elements whose ``action`` attribute is ``delete``.

    Returns a dict mapping each such key to that element's ``addr:*`` tags.
    Elements without any ``addr:*`` tag are left out.

    Raises ValueError if two address-bearing elements share the same key.
    """
    prefixes = {"node": "n", "way": "w", "relation": "r"}
    found = {}
    root = domParse(xmlstring).documentElement
    for element in root.childNodes:
        if element.nodeType != element.ELEMENT_NODE:
            continue
        prefix = prefixes.get(element.tagName)
        if prefix is None:
            # Not one of the three element kinds of interest.
            continue
        key = prefix + element.getAttribute("id")
        if element.getAttribute("action") == "delete":
            # Marked for deletion; its address does not count.
            continue
        tags = _fishOutAddressTags(element)
        if not tags:
            continue
        if key in found:
            raise ValueError("duplicate ID " + key)
        found[key] = tags
    return found
def pairwise(iterable, f):
    """Call ``f(a, b)`` once for every unordered pair of items.

    Each item is paired with every item that comes after it in iteration
    order, so for ``[x, y, z]`` the calls are ``f(x, y)``, ``f(x, z)``,
    ``f(y, z)``. Nothing is called when there are fewer than two items.

    Note that this is *all* pairs, not just adjacent ones (unlike
    ``itertools.pairwise``). The return values of ``f`` are discarded;
    any exception it raises propagates and stops the iteration.
    """
    # combinations() consumes the iterable up front, so one-shot iterators
    # and dict views are safe to pass in.
    for first, second in combinations(iterable, 2):
        f(first, second)
def prettyAddr(addr):
    """Format an address dict as ``"<street> <housenumber>[/<unit>]"``.

    ``addr`` must contain ``addr:street`` and ``addr:housenumber`` (a
    missing key raises KeyError); ``addr:unit`` is appended after a slash
    only when present.
    """
    street_and_number = "{0} {1}".format(addr["addr:street"], addr["addr:housenumber"])
    if "addr:unit" not in addr:
        return street_and_number
    return street_and_number + "/" + addr["addr:unit"]
def validateAddr(eid, addr):
    """Ensure an address dict has both a street and a house number.

    ``eid`` is the element identifier used as the prefix of the error
    message. The street is checked first.

    Raises ValueError if ``addr:street`` or ``addr:housenumber`` is missing.
    """
    required = (
        ("addr:street", " has no street"),
        ("addr:housenumber", " has no house number"),
    )
    for key, complaint in required:
        if key not in addr:
            raise ValueError(eid + complaint)
def isAddrEq(a, b):
    """Tell whether two ``(element id, address dict)`` pairs share an address.

    Both addresses are validated first. They are equal when street and
    house number match and either neither has an ``addr:unit`` or both
    have the same one.

    Returns True for equal addresses, False otherwise.

    Raises ValueError if either address fails validation, or if street and
    house number match but only one of the two addresses carries a unit
    (an ambiguous case that is reported rather than guessed at).
    """
    (aid, aaddr) = a
    (bid, baddr) = b
    validateAddr(aid, aaddr)
    validateAddr(bid, baddr)

    for key in ("addr:street", "addr:housenumber"):
        if aaddr[key] != baddr[key]:
            return False

    a_has_unit = "addr:unit" in aaddr
    b_has_unit = "addr:unit" in baddr

    if a_has_unit and b_has_unit:
        return aaddr["addr:unit"] == baddr["addr:unit"]
    # From here on at most one side has a unit.
    if b_has_unit:
        raise ValueError("{0} mostly equals to {1} but {0} has no unit while {1} has".format(aid, bid))
    if a_has_unit:
        raise ValueError("{0} mostly equals to {1} but {0} has a unit while {1} has not".format(aid, bid))
    return True
def raiseIfAddrEq(a, b):
    """Raise ValueError when two ``(element id, address dict)`` pairs collide.

    Does nothing if the addresses differ. Errors raised by the comparison
    itself propagate unchanged.
    """
    if not isAddrEq(a, b):
        return
    message = "{0}'s address ({2}) is equal to {1}'s address ({3})".format(
        a[0], b[0], prettyAddr(a[1]), prettyAddr(b[1])
    )
    raise ValueError(message)
if __name__ == '__main__':
    # Script entry point: check the OSM XML file named by the first
    # command-line argument and raise ValueError on the first duplicate
    # (or otherwise problematic) address found.
    from sys import argv

    if len(argv) < 2:
        raise SystemExit("usage: {0} OSMFILE".format(argv[0]))

    # Read raw bytes and let the XML parser decode them according to the
    # document's own encoding declaration, instead of relying on the
    # platform's default text encoding.
    with open(argv[1], "rb") as f:
        ad = fishOutAddressData(f.read())

    pairwise(ad.items(), raiseIfAddrEq)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment