Last active
August 29, 2015 14:17
-
-
Save kimoto/759771dd77ee9dd18fc0 to your computer and use it in GitHub Desktop.
Normalizer for mecab-neologd
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'moji'
# Character tables for normalize_neologd.
#
# Every non-ASCII character below is written as a \u escape, and the
# punctuation tables are built from explicit pairs instead of two positional
# String#tr specs. In a tr spec "-" and "\" are metacharacters, so a single
# slip there silently shifts every later pair by one position.

# Hyphen look-alikes rewritten to ASCII "-": U+02D7, U+058A, U+2010, U+2011,
# U+2012, U+2013, U+2043, U+207B, U+208B, U+2212.
NEOLOGD_HYPHENS = /[\u02D7\u058A\u2010\u2011\u2012\u2013\u2043\u207B\u208B\u2212]/

# Long-dash look-alikes rewritten to the prolonged sound mark: U+FE63,
# U+FF0D, U+FF70, U+2014, U+2015, U+2500, U+2501. ASCII "-" is NOT a member.
NEOLOGD_CHOONPUS = /[\uFE63\uFF0D\uFF70\u2014\u2015\u2500\u2501]/

# Tilde look-alikes, which are deleted: "~", U+223C, U+223E, U+301C, U+3030,
# U+FF5E.
NEOLOGD_TILDES = /[~\u223C\u223E\u301C\u3030\uFF5E]/

# U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK.
NEOLOGD_CHOONPU = "\u30FC".freeze

# ASCII punctuation => the stand-in it is temporarily replaced with while
# spaces are being removed. The default stand-in is the fullwidth form
# (code point + 0xFEE0); the exceptions are listed in the merge.
# "~" is absent because tildes are deleted before this table is applied, and
# ASCII backslash is absent because it is left untouched.
NEOLOGD_PUNCT_PAIRS = %q{!"#$%&'()*+,-./:;<=>?@[]^_`{|}}.each_char.map { |ascii|
  [ascii, (ascii.ord + 0xFEE0).chr(Encoding::UTF_8)]
}.to_h.merge(
  '"' => "\u201D",      # RIGHT DOUBLE QUOTATION MARK
  "'" => "\u2019",      # RIGHT SINGLE QUOTATION MARK
  # MINUS SIGN, not U+FF0D: it lies outside every block in NEOLOGD_CJK, so a
  # space next to a hyphen survives ("X- S" keeps its space even when X is a
  # kanji, whereas "X* S" becomes "X*S").
  '-' => "\u2212",
  "\u00A5" => "\uFFE5"  # YEN SIGN => FULLWIDTH YEN SIGN
).freeze

# Forward table: NEOLOGD_PUNCT_PAIRS plus halfwidth CJK punctuation, which is
# promoted to its fullwidth form and stays that way.
NEOLOGD_WIDEN = NEOLOGD_PUNCT_PAIRS.merge(
  "\uFF61" => "\u3002", # ideographic full stop
  "\uFF64" => "\u3001", # ideographic comma
  "\uFF65" => "\u30FB", # katakana middle dot
  "\uFF62" => "\u300C", # left corner bracket
  "\uFF63" => "\u300D"  # right corner bracket
).freeze
NEOLOGD_WIDEN_RE = Regexp.union(NEOLOGD_WIDEN.keys)

# Reverse table: stand-in => ASCII. U+FF1D (fullwidth "=") is removed from it,
# so an "=" in the input is returned in its fullwidth form.
NEOLOGD_NARROW = NEOLOGD_PUNCT_PAIRS.invert.tap { |table| table.delete("\uFF1D") }.freeze
NEOLOGD_NARROW_RE = Regexp.union(NEOLOGD_NARROW.keys)

# Unicode blocks treated as "Japanese side" / "Latin side" when deciding
# whether a single space between two runs is dropped.
NEOLOGD_CJK = '\p{InCJKUnifiedIdeographs}\p{InHiragana}\p{InKatakana}' \
              '\p{InHalfwidthAndFullwidthForms}\p{InCJKSymbolsAndPunctuation}'
NEOLOGD_LATIN = '\p{InBasicLatin}'

# A single space is dropped between: CJK and CJK, Latin and CJK, CJK and
# Latin. Applied in this order.
NEOLOGD_SPACE_GAPS = [
  /([#{NEOLOGD_CJK}]+) ([#{NEOLOGD_CJK}]+)/,
  /([#{NEOLOGD_LATIN}]+) ([#{NEOLOGD_CJK}]+)/,
  /([#{NEOLOGD_CJK}]+) ([#{NEOLOGD_LATIN}]+)/
].freeze

# Normalizes Japanese text for mecab-neologd.
#
# Steps, in order:
# 1. fullwidth digits and Latin letters become ASCII;
# 2. Moji.han_to_zen is applied with Moji::HAN_KATA;
# 3. hyphen look-alikes become "-", long-dash look-alikes become U+30FC,
#    tilde look-alikes are deleted, runs of U+30FC collapse to one;
# 4. ASCII punctuation is swapped for the stand-ins in NEOLOGD_WIDEN;
# 5. U+3000 becomes a space, runs of spaces collapse, and spaces at the
#    start and end of each line are removed;
# 6. single spaces matched by NEOLOGD_SPACE_GAPS are removed;
# 7. the stand-ins are swapped back to ASCII, except fullwidth "=".
#
# @param norm [String] text to normalize; it is never modified, so frozen
#   strings are accepted
# @return [String] a new, normalized string
def normalize_neologd(norm)
  # String#tr (not tr!) returns a copy; every bang method below works on
  # that private copy only.
  text = norm.tr("\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A", '0-9A-Za-z')
  text = Moji.han_to_zen(text, Moji::HAN_KATA)

  text.gsub!(NEOLOGD_HYPHENS, '-')
  text.gsub!(NEOLOGD_CHOONPUS, NEOLOGD_CHOONPU)
  text.gsub!(NEOLOGD_TILDES, '')
  text.squeeze!(NEOLOGD_CHOONPU)

  text.gsub!(NEOLOGD_WIDEN_RE, NEOLOGD_WIDEN)

  text.tr!("\u3000", ' ')
  text.squeeze!(' ')
  # ^ and $ are per-line on purpose: every line of a multi-line input is trimmed.
  text.gsub!(/^ +(.+?)$/, '\1')
  text.gsub!(/^(.+?) +$/, '\1')

  NEOLOGD_SPACE_GAPS.each do |gap|
    # One pass cannot join "A B C": the match for "A B" consumes B, so
    # repeat until gsub! reports that nothing changed (it returns nil).
    loop { break unless text.gsub!(gap, '\1\2') }
  end

  text.gsub!(NEOLOGD_NARROW_RE, NEOLOGD_NARROW)
  text
end
# Self-test: runs only when this file is executed directly, not when it is
# required by another program.
if $PROGRAM_NAME == __FILE__
  # Minimal equality check: returns true when +expect+ equals +actual+,
  # otherwise raises a RuntimeError showing both values.
  def assert(expect, actual)
    return true if expect == actual

    raise "Failed: Want #{expect.inspect} but #{actual.inspect}"
  end

  # Width conversions. These inputs are written as \u escapes so that they
  # stay different from their expected output even if the file passes through
  # a tool that folds character widths.
  assert "0", normalize_neologd("\u{FF10}")                           # fullwidth digit zero
  assert "ハンカク", normalize_neologd("\u{FF8A}\u{FF9D}\u{FF76}\u{FF78}") # halfwidth katakana
  assert "!#", normalize_neologd("\u{FF01}\u{FF03}")                  # fullwidth "!" and "#"
  assert "ゼンカクスペース", normalize_neologd("ゼンカク\u{3000}スペース")  # ideographic space

  # Hyphen, long-dash and tilde look-alikes.
  assert "o-o", normalize_neologd("o₋o")
  assert "majikaー", normalize_neologd("majika━")
  assert "わい", normalize_neologd("わ〰い")
  assert "スーパー", normalize_neologd("スーパーーーー")

  # Space handling.
  assert "おお", normalize_neologd("お お")
  assert "おお", normalize_neologd(" おお")
  assert "おお", normalize_neologd("おお ")
  assert "検索エンジン自作入門を買いました!!!", normalize_neologd("検索 エンジン 自作 入門 を 買い ました!!!")
  assert "アルゴリズムC", normalize_neologd("アルゴリズム C")
  assert "PRML副読本", normalize_neologd(" PRML 副 読 本 ")
  assert "Coding the Matrix", normalize_neologd("Coding the Matrix")
  assert "南アルプスの天然水Sparking Lemonレモン一絞り", normalize_neologd("南アルプスの 天然水 Sparking Lemon レモン一絞り")
  assert "南アルプスの天然水- Sparking*Lemon+レモン一絞り", normalize_neologd("南アルプスの 天然水- Sparking* Lemon+ レモン一絞り")
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment