Skip to content

Instantly share code, notes, and snippets.

@kawasima
Created July 1, 2015 08:02
Show Gist options
  • Save kawasima/7302a133b324dfbb37c0 to your computer and use it in GitHub Desktop.
Save kawasima/7302a133b324dfbb37c0 to your computer and use it in GitHub Desktop.
Kuromojiを漢字→フリガナに使ってみるテスト
import static org.hamcrest.CoreMatchers.*;
import static org.junit.Assert.*;
import org.junit.Test;
import java.io.IOException;
/**
 * Checks {@link Furiganizer#furiganaize(String)} against a postal address and
 * a handful of personal names.
 *
 * @author kawasima
 */
public class FuriganizerTest {

    /**
     * Runs the given text through {@code Furiganizer.furiganaize} and asserts
     * that the result equals the expected reading.
     *
     * @param text         input text handed to the furiganizer
     * @param expectedKana reading the furiganizer is expected to return
     * @throws IOException propagated from {@code Furiganizer.furiganaize}
     */
    private static void assertReading(String text, String expectedKana) throws IOException {
        String actualKana = Furiganizer.furiganaize(text);
        assertThat(actualKana, is(expectedKana));
    }

    /** An address mixing kanji, a digit and katakana. */
    @Test
    public void furiganaizeAddress() throws IOException {
        assertReading("東京都千代田区千代田1番江戸城マンション",
                "トウキョウトチヨダクチヨダ1バンエドジョウマンション");
    }

    /** Personal names (shimei). */
    @Test
    public void furiganaShimei() throws IOException {
        assertReading("上島竜兵", "ウエシマリュウヘイ");
        assertReading("寺門ジモン", "テラカドジモン");
        assertReading("肥後克広", "ヒゴカツヒロ");
        // Original author's note: this name comes out OK when the neologd
        // dictionary is used; the value pinned here is what this setup yields.
        assertReading("安倍晋三", "アベススムサン");
    }
}
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
/**
 * Converts Japanese text into its reading (furigana) using Lucene's Kuromoji
 * {@link JapaneseAnalyzer}.
 *
 * @author kawasima
 */
public class Furiganizer
{
    /**
     * Part-of-speech tags passed to the analyzer as stop tags. All of them are
     * in the "記号" (symbol) category: general, comma, period, whitespace,
     * opening bracket, closing bracket and alphabet.
     */
    private static final String[] stoptags = {
        "記号-一般", "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット"
    };

    /**
     * Tokenizes {@code kanjiText} and concatenates the reading of every token
     * the analyzer emits. A token for which the analyzer reports no reading
     * contributes its own surface text instead.
     *
     * <p>A fresh analyzer (no user dictionary, {@code NORMAL} mode, no stop
     * words, the stop tags above) is created for each call and is closed
     * before the method returns.
     *
     * @param kanjiText the text to convert
     * @return the concatenated readings, with no separator between tokens
     * @throws IOException if the underlying token stream fails
     */
    public static String furiganaize(String kanjiText) throws IOException {
        StringBuilder sb = new StringBuilder(512);
        // try-with-resources closes the stream first and then the analyzer,
        // even when tokenization throws part-way through.
        try (JapaneseAnalyzer analyzer = new JapaneseAnalyzer(null, JapaneseTokenizer.Mode.NORMAL,
                CharArraySet.EMPTY_SET,
                new HashSet<String>(Arrays.asList(stoptags)));
             TokenStream stream = analyzer.tokenStream("", kanjiText)) {
            stream.reset();
            // The attribute instances are reused for every token, so look them
            // up once instead of on each iteration.
            ReadingAttribute readingAttribute = stream.getAttribute(ReadingAttribute.class);
            CharTermAttribute charTermAttribute = stream.getAttribute(CharTermAttribute.class);
            while (stream.incrementToken()) {
                String kana = readingAttribute.getReading();
                if (kana == null) {
                    // No reading available: fall back to the token text as written.
                    kana = charTermAttribute.toString();
                }
                sb.append(kana);
            }
            // The TokenStream consumer contract requires end() after the last
            // incrementToken() and before close().
            stream.end();
        }
        return sb.toString();
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment