-
-
Save h6ah4i/4a7c37b9e6114a671faf391cb1f909eb to your computer and use it in GitHub Desktop.
住所を上手く検出・分割するライブラリ。MITライセンス。https://qiita.com/ysrken/items/5371e67950425ace6a73
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'unf' | |
require "sqlite3" | |
# Detects the parts of a Japanese postal address (prefecture, city, town,
# block number, building) inside free-form text and splits them apart.
#
# Place names are looked up in the `ad_address` table (columns `ken_name`,
# `city_name`, `town_name`) of the SQLite file `zenkoku.sqlite3`, which is
# opened relative to the current working directory.
class AddressSplitter
  # Value of every kanji numeral character understood by #kan_to_arabia.
  KANJI_NUMERALS = {
    "一" => 1, "二" => 2, "三" => 3, "四" => 4, "五" => 5,
    "六" => 6, "七" => 7, "八" => 8, "九" => 9, "〇" => 0,
    "十" => 10, "百" => 100, "千" => 1000, "万" => 10000
  }.freeze

  # A run of kanji numerals inside a block number.
  KANJI_NUMBER_PATTERN = /[一二三四五六七八九十百千万]+/

  # Block-number ("banchi") pattern: numbers joined by connector words.
  # NOTE(review): the original description says the match starts with a number,
  # but the leading number group is optional (`*`), so a match may also start
  # with a connector such as 東 or 番. Kept as is to preserve behavior.
  ADDR1_PATTERN = begin
    # Kanji numerals
    k_num = '[一二三四五六七八九十百千万]'
    # Connectors allowed between numbers (anywhere but the end)
    s_str1 = '(丁目|丁|番地|番|号|-|‐|ー|−|の|東|西|南|北)'
    # Connectors allowed at the end
    s_str2 = '(丁目|丁|番地|番|号)'
    # Any number, Arabic or kanji
    all_num = "(\\d+|#{k_num}+)"
    /#{all_num}*(#{all_num}|#{s_str1}{1,2})*(#{all_num}|#{s_str2}{1,2})/
  end

  # Prefecture pattern used by the regex-based detector.
  PREF_PATTERN = /[^\x00-\x7F]{2,3}県|..府|東京都|北海道/

  # City/ward patterns used by the regex-based detector, tried in this order.
  CITY_PATTERNS = [
    /([^\x00-\x7F]{1,6}市[^\x00-\x7F]{1,4}区)/,
    /([^\x00-\x7F]{1,3}郡[^\x00-\x7F]{1,5}町)/,
    /(四日|廿日|野々)市市/,
    /([^\x00-\x7F市]{1,6}市)/,
    /([^\x00-\x7F]{1,4}区)/
  ].freeze

  # Bracketed annotations removed by #sanitize, applied in this order.
  BRACKETED_PATTERNS = [
    /【.*?】/, /≪.*?≫/, /《.*?》/, /◎.*?◎/, /〔.*?〕/,
    /\[.*?\]/, /<.*?>/, /\(.*?\)/, /「.*?」/
  ].freeze

  # Spelling variations of ケ / ヶ / が tried (in this order) when a place name
  # is not found verbatim. Each pair is a `String#tr` from/to argument pair.
  KANA_VARIANTS = [
    ['ケヶ', 'ヶケ'],
    ['ケヶ', 'がが'],
    ['が', 'ヶ'],
    ['がヶ', 'ケ']
  ].freeze

  # Opens the address database and prepares the empty lookup caches.
  def initialize
    # Load the DB
    @db = SQLite3::Database.new 'zenkoku.sqlite3'
    # Caches: prefecture list, cities per prefecture, towns per [pref, city]
    @pref_list = []
    @city_list = {}
    @town_list = {}
  end

  # Runs a query against the address DB and returns the resulting rows.
  #
  # @param sql [String] SQL text, optionally containing `?` placeholders
  # @param binds [Array] values bound to the placeholders (default: none)
  # @return [Array<Array>] result rows
  def sql_execute(sql, binds = [])
    return @db.execute(sql, binds)
  end

  # Returns the list of prefecture names (queried once, then cached).
  #
  # @return [Array<String>]
  def pref_list()
    if @pref_list.empty?
      rows = sql_execute("SELECT ken_name from ad_address GROUP BY ken_name")
      # Keep only the first column of every row
      @pref_list = rows.map { |row| row[0] }
    end
    return @pref_list
  end

  # Returns the city/ward names of a prefecture (cached per prefecture).
  #
  # @param pref_name [String] prefecture name; "" means "all prefectures"
  # @return [Array<String>]
  def city_list(pref_name)
    unless @city_list.has_key?(pref_name)
      @city_list[pref_name] = select_names("city_name", [["ken_name", pref_name]])
    end
    return @city_list[pref_name]
  end

  # Returns the town names of a prefecture/city pair (cached per pair).
  #
  # @param pref_name [String] prefecture name; "" disables this filter
  # @param city_name [String] city name; "" disables this filter
  # @return [Array<String>]
  def town_list(pref_name, city_name)
    key = [pref_name, city_name]
    unless @town_list.has_key?(key)
      filters = [["ken_name", pref_name], ["city_name", city_name]]
      @town_list[key] = select_names("town_name", filters)
    end
    return @town_list[key]
  end

  # Removes everything that is not part of the address itself.
  #
  # @param input_address [String] raw text; it is not modified
  # @return [String] cleaned, NFKC-normalized copy
  def sanitize(input_address)
    # NFKC-normalize first so that full-width forms collapse to ASCII
    address = UNF::Normalizer.normalize(input_address, :nfkc)
    # Remove what looks like a phone number, then the TEL/FAX labels
    address = address.gsub(/0\d{1,4}-?\d{1,4}-?\d{4}/, '')
    address = address.gsub(/TEL:|FAX:|TEL|FAX/, '')
    # Remove what looks like a postal code, then the postal marks
    address = address.gsub(/\d\d\d-\d\d\d\d/, '')
    address = address.gsub(/〒|郵便番号|郵便/, '')
    # Remove bracketed parts
    BRACKETED_PATTERNS.each { |pattern| address = address.gsub(pattern, '') }
    # Remove everything from a bullet-like mark to the end of the line
    address = address.gsub(/(◎|※|☆|★|◇|◆|□|■|●|○|~|〜).*/, '')
    return address
  end

  # Detects the prefecture by matching against the known prefecture names.
  #
  # The leftmost occurrence wins, and the longer name wins on a tie, so that
  # e.g. "東京都府中市" is read as 東京都 rather than 京都府.
  #
  # @param address [String] text to search
  # @param norm_address [Hash] result hash; :pref is set when a match is found
  # @return [Array(String, Hash)] text after the prefecture (the whole text
  #   when nothing matched) and the result hash
  def match_pref(address, norm_address)
    pref, rest = leftmost_longest_match(address, pref_list) do |name|
      address.index(name)
    end
    norm_address[:pref] = pref if pref
    return rest, norm_address
  end

  # Detects the prefecture with a regular expression (no DB lookup).
  #
  # @param address [String] text to search
  # @param norm_address [Hash] result hash; :pref is set when a match is found
  # @return [Array(String, Hash)] rest of the line after the prefecture and
  #   the result hash
  def match_pref_e(address, norm_address)
    found = PREF_PATTERN.match(address)
    if found
      norm_address[:pref] = found[0]
      address = rest_of_line(found)
    end
    return address, norm_address
  end

  # Detects the city/ward by matching against the known names of the
  # prefecture in norm_address[:pref], allowing ケ/ヶ/が spelling variations.
  #
  # @param address [String] text to search
  # @param norm_address [Hash] result hash; :city is set when a match is found
  # @return [Array(String, Hash)] text after the city and the result hash
  def match_city(address, norm_address)
    city, rest = leftmost_longest_match(address, city_list(norm_address[:pref])) do |name|
      index_with_kana_variants(address, name)
    end
    norm_address[:city] = city if city
    return rest, norm_address
  end

  # Detects the city/ward with regular expressions (no DB lookup).
  # The first pattern of CITY_PATTERNS that matches decides the result.
  #
  # @param address [String] text to search
  # @param norm_address [Hash] result hash; :city is set when a match is found
  # @return [Array(String, Hash)] rest of the line after the city and the
  #   result hash
  def match_city_e(address, norm_address)
    CITY_PATTERNS.each do |pattern|
      found = pattern.match(address)
      # Skip patterns that do not match
      next unless found
      norm_address[:city] = found[0]
      address = rest_of_line(found)
      break
    end
    return address, norm_address
  end

  # Detects the town by matching against the known names of the
  # prefecture/city in norm_address, allowing ケ/ヶ/が spelling variations.
  #
  # @param address [String] text to search
  # @param norm_address [Hash] result hash; :town is set when a match is found
  # @return [Array(String, Hash)] text after the town and the result hash
  def match_town(address, norm_address)
    towns = town_list(norm_address[:pref], norm_address[:city])
    town, rest = leftmost_longest_match(address, towns) do |name|
      index_with_kana_variants(address, name)
    end
    norm_address[:town] = town if town
    return rest, norm_address
  end

  # Detects the block number (first match of ADDR1_PATTERN).
  #
  # @param address [String] text to search
  # @param norm_address [Hash] result hash; :addr1 is set when a match is found
  # @return [Array(String, Hash)] rest of the line after the block number and
  #   the result hash
  def match_addr1(address, norm_address)
    found = ADDR1_PATTERN.match(address)
    if found
      norm_address[:addr1] = found[0]
      address = rest_of_line(found)
    end
    return address, norm_address
  end

  # Detects the block number, preferring the longest candidate in the text.
  # The text in front of the chosen candidate is stored as the town.
  #
  # @param address [String] text to search
  # @param norm_address [Hash] result hash; :addr1 and :town are set when a
  #   candidate is found
  # @return [Array(String, Hash)] rest of the line after the block number and
  #   the result hash
  def match_addr1_old(address, norm_address)
    # Nothing to detect when the pattern does not match at all
    return address, norm_address unless ADDR1_PATTERN.match(address)

    # Collect candidates by repeatedly cutting the first match out of a copy
    candidates = []
    scratch = address.dup
    while (found = ADDR1_PATTERN.match(scratch))
      candidates.push(found[0])
      scratch.sub!(found[0], "")
    end

    # The longest candidate is assumed to be the real block number
    addr1 = candidates.max_by(&:size)
    position = address.index(addr1)
    norm_address[:addr1] = addr1
    # Whatever precedes the block number is taken as the town
    norm_address[:town] = address[0, position]
    address = address[(position + addr1.size)..-1][/.*/]
    return address, norm_address
  end

  # Normalizes a block number to the "n-n-n" form with Arabic digits.
  #
  # @param addr1 [String] detected block number; it is not modified
  # @return [String]
  def norm_addr1(addr1)
    # Unify hyphen look-alikes
    normalized = addr1.gsub(/-|‐|ー|−/, '-')
    # Replace 丁目 and friends with a hyphen, then tidy the hyphens up
    normalized = normalized.gsub(/丁目|丁|番地|番|号|の/, '-')
    normalized = normalized.gsub(/-{2,}/, '-')
    normalized = normalized.gsub(/(^-)|(-$)/, '')
    # Convert kanji numerals to Arabic numerals
    return normalized.gsub(KANJI_NUMBER_PATTERN) { |numeral| kan_to_arabia(numeral).to_s }
  end

  # Converts a kanji numeral string to an Integer.
  #
  # Digits without unit characters are read positionally (一九四五 => 1945);
  # 十/百/千 multiply the digits in front of them (or 1 when there are none);
  # 万 multiplies everything accumulated since the previous 万.
  #
  # @param str [String] kanji numeral, e.g. "二百五十三"
  # @return [Integer] the value; 0 for an empty string
  # @raise [ArgumentError] if str contains a character that is not a numeral
  def kan_to_arabia(str)
    total = 0    # sum of the completed 万 groups
    section = 0  # value accumulated below the current 万
    digits = 0   # positional digits waiting for a unit
    str.each_char do |char|
      value = KANJI_NUMERALS[char]
      raise ArgumentError, "not a kanji numeral: #{char.inspect}" if value.nil?

      if value < 10
        digits = digits * 10 + value
      elsif value == 10000
        section += digits
        # A bare 万 (nothing in front of it) counts as 1万
        section = 1 if section == 0
        total += section * value
        section = 0
        digits = 0
      else
        section += (digits == 0 ? 1 : digits) * value
        digits = 0
      end
    end
    return total + section + digits
  end

  # Normalizes the building part of an address.
  #
  # @param addr2 [String] text that follows the block number; not modified
  # @return [String]
  def norm_addr2(addr2)
    # Drop everything from an opening parenthesis and turn "nF" into "n階"
    building = addr2.gsub(/\(.*/, '').gsub(/(\d+)F/, '\1階')
    # Read no further than "…号室" / "…階"
    # NOTE(review): the 階 cut runs second, so "5階501号室" ends up as "5階".
    building = building.match(/^.*号室/)[0] if building.include?('号室')
    building = building.match(/^.*階/)[0] if building.include?('階')
    # If another address follows, cut at the earliest prefecture name
    cut = pref_list.map { |pref| building.index(pref) }.compact.min
    building = building[0, cut] if cut
    # Strip leading and trailing spaces
    return building.sub(/^ +/, '').sub(/ +$/, '')
  end

  # Detects and splits an address.
  #
  # @param input_address [String] free-form text containing an address
  # @return [Hash] with the keys :pref, :city, :town, :addr1 and :addr2; a
  #   part that was not found is ""
  def split(input_address)
    # Remove the noise
    address = sanitize(input_address)
    # Result hash
    norm_address = {:pref => "", :city => "", :town => "", :addr1 => "", :addr2 => ""}
    # Prefecture (matched against known names)
    address, norm_address = match_pref(address, norm_address)
    #address, norm_address = match_pref_e(address, norm_address)
    # City/ward (matched against known names)
    address, norm_address = match_city(address, norm_address)
    #address, norm_address = match_city_e(address, norm_address)
    # Town (matched against known names)
    address, norm_address = match_town(address, norm_address)
    # Block number
    address, norm_address = match_addr1(address, norm_address)
    #address, norm_address = match_addr1_old(address, norm_address)
    norm_address[:addr1] = norm_addr1(norm_address[:addr1])
    # Whatever is left is the building
    norm_address[:addr2] = norm_addr2(address)
    return norm_address
  end

  private

  # Selects the distinct values of `column`, filtered by the non-empty entries
  # of `filters` (pairs of column name and value). Values are passed as bound
  # parameters; column names are internal literals only.
  def select_names(column, filters)
    active = filters.reject { |_, value| value == "" }
    sql = "SELECT #{column} FROM ad_address"
    unless active.empty?
      sql += " WHERE " + active.map { |name, _| "#{name} = ?" }.join(" AND ")
    end
    sql += " GROUP BY #{column}"
    sql_execute(sql, active.map { |_, value| value }).map { |row| row[0] }
  end

  # Picks the candidate found leftmost in `address` (the longer one on a tie).
  # The block receives a candidate and returns its index or nil. Returns the
  # chosen name (nil if none) and the text following it (the whole text if none).
  def leftmost_longest_match(address, candidates)
    best_name = nil
    best_index = address.size
    candidates.each do |name|
      # A blank name would match at index 0 and hide every real match
      next if name.nil? || name.empty?

      index = yield(name)
      next if index.nil?
      next unless index < best_index ||
                  (index == best_index && name.size > best_name.size)

      best_name = name
      best_index = index
    end
    return [nil, address] if best_name.nil?

    [best_name, address[(best_index + best_name.size)..-1]]
  end

  # Index of `name` in `address`, retrying with each ケ/ヶ/が spelling variation.
  def index_with_kana_variants(address, name)
    position = address.index(name)
    KANA_VARIANTS.each do |from, to|
      break if position

      position = address.index(name.tr(from, to))
    end
    position
  end

  # Text between the end of a match and the end of that line.
  def rest_of_line(match)
    match.post_match[/.*/]
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sample usage of AddressSplitter.
# Treat external data (files, console) as UTF-8.
Encoding.default_external = "UTF-8"

# Sample input: an address surrounded by a postal code and a phone number
address = "住所:〒105-0011 東京都港区芝公園4丁目2-8東京タワー大展望台2F TEL:03-3433-5111"

# Build a splitter and split the address in one go
norm_address = AddressSplitter.new.split(address)

# Print the raw input followed by the split result
[address, norm_address].each { |value| p value }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment