Created
June 29, 2020 16:42
-
-
Save Gerenuk/d7fee2ebe3cfc1c945e86c5e1d75d572 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import unicodedata | |
# States of the small parser below: collect the street-name words, then the
# first house number, then an optional second ("range") number.
STATE_FIND_STREETNAME = 1
STATE_FIND_FIRST_NUMBER = 2
STATE_FIND_SECOND_NUMBER = 3
STATE_ALL_FOUND = 4


def normalize_street(
    street,
    regex=re.compile("[a-zß]+|[0-9]+"),  # relevant after unicode normalization
    MAX_HAUSNUMMER=10000,
    MAX_HAUSNUMMER_DIFF=2,
    LETTERS_AFTER_HAUSNUMMER="abc",
    replace=None,
):
    """
    Normalize a (German) street string into (name, house numbers, clean flag).

    Tokenizes with *regex* (runs of lowercase letters or of digits, applied
    after lowercasing and accent stripping), then expects the token pattern:

        Alpha+ ((Num1 Letter?) (Num2 Letter?)?)?

    Accents are removed via NFKD decomposition (ä -> a, ö -> o; ß is kept).
    Consider replacing ä->ae, straße->str etc. via *replace*, maybe.

    Parameters
    ----------
    street : str
        Raw input; falsy input yields ("", set(), True).
    regex : compiled pattern
        Tokenizer applied after lowercasing/accent removal.
    MAX_HAUSNUMMER : int
        House numbers outside (0, MAX_HAUSNUMMER] are discarded (with a
        diagnostic print).
    MAX_HAUSNUMMER_DIFF : int
        A "Num1 - Num2" pair expands to the full range only when
        Num1 < Num2 and Num2 - Num1 <= MAX_HAUSNUMMER_DIFF; otherwise only
        Num1 is kept and the match is flagged unclean.
    LETTERS_AFTER_HAUSNUMMER : str
        Single letters tolerated directly after a house number ("21 a").
    replace : dict[str, str] or None
        Optional literal substring replacements applied before anything else.

    Returns
    -------
    tuple[str, set[int], bool]
        Normalized street name, set of house numbers, and whether the input
        parsed cleanly (no trailing junk, no implausible number range).

    >>> normalize_street("Abc dßäöüf ghi a21 b - 23 a")
    ('abc dßaouf ghi a', {21, 22, 23}, True)
    """
    if not street:  # empty / None input
        # BUGFIX: originally returned a 2-tuple, inconsistent with the
        # 3-tuple returned on every other path.
        return "", set(), True
    x = street
    if replace is not None:
        for old, new in replace.items():
            x = x.replace(old, new)
    x = x.lower()
    x = unicodedata.normalize("NFKD", x)  # split accents into combining marks
    x = "".join(c for c in x if not unicodedata.combining(c))  # drop the marks
    # BUGFIX: original referenced undefined name `street_regex` (NameError);
    # the parameter is called `regex`.
    parts = regex.findall(x)
    street_name_parts = []
    number1 = None
    number2 = None
    clean_match = True
    try:
        parts_iter = iter(parts)
        state = STATE_FIND_STREETNAME
        part = next(parts_iter)  # StopIteration here means "no tokens at all"
        while True:
            if state == STATE_FIND_STREETNAME:
                if part.isalpha():
                    street_name_parts.append(part)
                    part = next(parts_iter)
                else:
                    state = STATE_FIND_FIRST_NUMBER
            elif state == STATE_FIND_FIRST_NUMBER:
                if part.isdecimal():
                    number1 = int(part)
                    part = next(parts_iter)
                    # BUGFIX: `part in LETTERS_AFTER_HAUSNUMMER` is a substring
                    # test, so multi-letter tokens like "ab" were wrongly
                    # swallowed as a house-number letter; require one char.
                    if len(part) == 1 and part in LETTERS_AFTER_HAUSNUMMER:
                        part = next(parts_iter)
                    state = STATE_FIND_SECOND_NUMBER
                else:
                    # Unreachable with the default regex (tokens are all-alpha
                    # or all-digit) but guards custom tokenizers.
                    raise ValueError(f"Sollte Nummer sein: {part} in {street}")
            elif state == STATE_FIND_SECOND_NUMBER:
                if part.isdecimal():
                    number2 = int(part)
                    part = next(parts_iter)
                    if len(part) == 1 and part in LETTERS_AFTER_HAUSNUMMER:
                        part = next(parts_iter)
                # Advance unconditionally so a non-number token is reported
                # below instead of looping forever.
                state = STATE_ALL_FOUND
            elif state == STATE_ALL_FOUND:
                print(
                    f"Mehr Text '{part}' nachdem Straße/Hausnummer schon erkannt: {street}"
                )
                clean_match = False
                raise StopIteration()
            else:
                assert 0, "Illegal state"
    except StopIteration:
        pass  # token stream exhausted — normal loop exit
    # Discard implausible house numbers.
    if number1 is not None and not (0 < number1 <= MAX_HAUSNUMMER):
        print(f"Illegale Hausnummer {number1} in '{street}'")
        number1 = None
    if number2 is not None and not (0 < number2 <= MAX_HAUSNUMMER):
        print(f"Illegale Hausnummer {number2} in '{street}'")
        number2 = None
    # Expand "21 - 23" style ranges only when they ascend by a small amount.
    if number1 is not None:
        if number2 is not None:
            if number1 < number2 and number2 - number1 <= MAX_HAUSNUMMER_DIFF:
                numbers = set(range(number1, number2 + 1))
            else:
                print(f"Keine etwas ansteigenden Nummern in '{street}'")
                numbers = {number1}
                clean_match = False
        else:
            numbers = {number1}
    else:
        numbers = set()
    norm_street = " ".join(street_name_parts)
    if not clean_match:  # usually for ambiguous normalizations
        print(
            f"Fallback normalization: '{street}' --> '{norm_street} {','.join(map(str, sorted(numbers)))}'"
        )
    return norm_street, numbers, clean_match
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment