AddressMatcher for the Japanese addressing system, using https://geolonia.github.io/japanese-addresses/
import regex as re
import pandas as pd
class AddressMatcher:
    def __init__(self):
        self.pref_city_addr_matcher = self.geolonia_dataset()
    @staticmethod
    def geolonia_dataset():
        # Build a nested {prefecture: {city: {town names}}} lookup from the Geolonia dataset.
        url = "https://raw.githubusercontent.com/geolonia/japanese-addresses/master/data/latest.csv"
        address_df = pd.read_csv(url)
        pref_city_addr = address_df[['都道府県名', '市区町村名', '大字町丁目名']]
        pref_city_addr_matcher = {
            pref: {city: set(addr['大字町丁目名']) for city, addr in city_addr.set_index('市区町村名').groupby(level=0)}
            for pref, city_addr in pref_city_addr.set_index('都道府県名').groupby(level=0)
        }
        return pref_city_addr_matcher
    @staticmethod
    def ksuji(match):
        # Replacement callback for re.sub: convert an ASCII digit before 丁目 to its kanji numeral.
        tt_ksuji = str.maketrans('123456789', '一二三四五六七八九')
        ks = match.group(1)
        return f'{ks.translate(tt_ksuji)}丁目'
    def match_address(self, text, partial=True):
        # Yield (begin, end) spans of prefecture + city + town names matched in `text`.
        # Normalize ASCII 丁目 numerals to kanji so they match the dataset notation.
        text = re.sub('([1-9])丁目', self.ksuji, text)  # cf. unicodedata.normalize('NFKC', text)
        for pref in self.pref_city_addr_matcher:
            for m in re.finditer(pref, text):
                begin, end = m.span()
                pref_text = text[end:]
                for city in self.pref_city_addr_matcher[pref]:
                    if pref_text.startswith(city):
                        city_text = pref_text[len(city):]
                        for addr in self.pref_city_addr_matcher[pref][city]:
                            if city_text.startswith(addr):
                                addr_span = (begin, end + len(city) + len(addr))
                                yield addr_span
                                break
                        else:
                            # No exact town-name match: optionally retry with the 丁目 suffix
                            # stripped (e.g. dataset 'X一丁目' vs. text 'X1-2-3').
                            if partial:
                                addr_partial = {re.sub('[一二三四五六七八九]丁目', '', addr)
                                                for addr in self.pref_city_addr_matcher[pref][city]}
                                for addr in addr_partial:
                                    if city_text.startswith(addr):
                                        addr_span = (begin, end + len(city) + len(addr))
                                        yield addr_span
    @staticmethod
    def adjunct_address(text, spans, margin=10):
        # Extend each matched span with trailing block/lot numbers (e.g. 1-2-3, 1丁目2番3号).
        address_numbers = re.compile(
            r"^\p{N}+-\p{N}+-\p{N}+|"
            r"^\p{N}+-\p{N}+|"
            r"^\p{N}+丁目\p{N}+番地?\p{N}+号?|"
            r"^\p{N}+丁目\p{N}+|"
            r"^\p{N}+番地?\p{N}+号?|"
            r"^\p{N}+番地?|"
            r"^(\p{N}+)[^\-]"
        )
        for b, e in spans:
            # Look at up to `margin` characters immediately after the matched span.
            text_fragment = text[e:e + margin]
            m = address_numbers.search(text_fragment)
            if m:
                # The bare-number alternative captures group(1) without its trailing character.
                match = m.group() if m.group(1) is None else m.group(1)
                yield (b, e + len(match))
            else:
                yield (b, e)
    def match_spans(self, text):
        return list(self.adjunct_address(text, list(self.match_address(text))))
    def match_text_spans(self, texts, filter_empty=True):
        text_spans = [(text, self.match_spans(text)) for text in texts]
        if filter_empty:
            return [(t, ss) for t, ss in text_spans if len(ss) > 0]
        else:
            return text_spans
if __name__ == '__main__':
    import tensorflow_datasets as tfds
    dataset, dataset_info = tfds.load(
        name='wikipedia/20190301.ja',
        data_dir='tmp',
        with_info=True,
        split=tfds.Split.TRAIN,
    )
    am = AddressMatcher()
    texts = [example['text'].numpy().decode('utf-8') for example in dataset.take(100)]
    text_spans_full = am.match_text_spans(texts)
    print(text_spans_full)
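
For reference, a minimal usage sketch run as a separate script, without the TensorFlow Datasets sample above. The module name address_matcher and the sample sentence are assumptions for illustration only; constructing AddressMatcher downloads the Geolonia CSV, so network access is required.

from address_matcher import AddressMatcher  # assumes the gist is saved as address_matcher.py

am = AddressMatcher()  # downloads and parses the Geolonia CSV on construction
sample = '東京都千代田区丸の内1-2-3 に移転しました。'
for text, spans in am.match_text_spans([sample]):
    for begin, end in spans:
        print(text[begin:end])  # prints the matched address substring(s)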