Last active
August 29, 2015 14:20
-
-
Save timrae/e9efadbfe59a8473389b to your computer and use it in GitHub Desktop.
Convert a kanji and its reading into furigana in the format which Anki expects for Ruby text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**************************************************************************************** | |
* Copyright (c) 2015, Timothy Rae * | |
* All rights reserved. * | |
* * | |
* Redistribution and use in source and binary forms, with or without * | |
* modification, are permitted provided that the following conditions are met: * | |
* * Redistributions of source code must retain the above copyright * | |
* notice, this list of conditions and the following disclaimer. * | |
* * Redistributions in binary form must reproduce the above copyright * | |
* notice, this list of conditions and the following disclaimer in the * | |
* documentation and/or other materials provided with the distribution. * | |
* * Neither the name of the copyright holders nor the * | |
* names of its contributors may be used to endorse or promote products * | |
* derived from this software without specific prior written permission. * | |
* * | |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * | |
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * | |
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * | |
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY * | |
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * | |
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * | |
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * | |
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * | |
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * | |
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * | |
****************************************************************************************/ | |
/**
 * Helpers for producing furigana (ruby text) in the bracket notation used by Anki,
 * e.g. {@code 振[ふ]り 返[かえ]る}.
 */
public class FuriganaTools {
    /**
     * Matches one maximal run of kana: hiragana, half-width katakana or full-width katakana.
     * Everything that does not match is treated as "kanji" by {@link #makeFurigana}.
     */
    private static final Pattern KANA_REGEXP = Pattern.compile("[\u3041-\u309e\uff66-\uff9d\u30a1-\u30fe]+");

    /**
     * Template for one annotated segment: base text followed by its reading in brackets.
     * The leading space separates the base text from whatever precedes it in the output.
     */
    private static final String RUBY = " %s[%s]";

    /**
     * Add the reading to a kanji word as Ruby furigana in the Anki format,
     * ensuring that there is only furigana above the kanji,
     * not above any of the kana included in the word.
     *
     * <p>Example: ("振り返る", "ふりかえる") -> "振[ふ]り 返[かえ]る"
     *
     * <p>The reading is aligned with the word by locating each kana run of the word inside
     * the reading. The match is heuristic: the first occurrence that leaves at least one
     * reading character per preceding kanji is used. If the two strings cannot be aligned
     * (a kana run is missing from the reading, the reading is too short, or part of the
     * reading is left over), the whole word is annotated with the whole reading instead.
     *
     * <p>Kana at the very start of the word is assumed to be mirrored by the same number of
     * characters at the start of the reading; only the length is checked, not the content.
     *
     * @param kanji the word to which furigana should be applied; must not be null
     * @param reading the kana reading corresponding to the whole word
     * @return the annotated word. When the word is split into segments the result has no
     *         leading space; when the word contains no kana, or the alignment fails, the
     *         result is the plain {@code " word[reading]"} form including its leading space
     */
    public static String makeFurigana(String kanji, String reading) {
        Matcher kanaMatcher = KANA_REGEXP.matcher(kanji);
        // All characters are kanji; simple replacement will work
        if (!kanaMatcher.find()) {
            return naiveFurigana(kanji, reading);
        }

        StringBuilder output = new StringBuilder();
        // Index of the first character of each input that has not been emitted yet.
        // Both inputs are left untouched so that the fallback can always use the full strings.
        int kanjiPos = 0;
        int readingPos = 0;
        // True while kanaMatcher is positioned on a kana run that still has to be processed
        boolean hasKana = true;

        // Copy any kana at the beginning of the word straight to the output
        if (kanaMatcher.start() == 0) {
            String prefix = kanaMatcher.group();
            if (reading.length() < prefix.length()) {
                // Reading cannot contain the leading kana, so there is nothing to align
                return naiveFurigana(kanji, reading);
            }
            output.append(prefix);
            kanjiPos = prefix.length();
            readingPos = prefix.length();
            hasKana = kanaMatcher.find();
        }

        // Step through each "kanji run followed by kana run" pair
        while (hasKana) {
            String currentKana = kanaMatcher.group();
            // The kanji in-between the previous kana run and the current one
            String currentKanji = kanji.substring(kanjiPos, kanaMatcher.start());
            // Not perfect: take the first occurrence of the kana that leaves at least one
            // reading character for every kanji in the current run
            int readingKanaStart = reading.indexOf(currentKana, readingPos + currentKanji.length());
            if (readingKanaStart < 0) {
                // The kana of the word does not occur in the reading; alignment is impossible
                return naiveFurigana(kanji, reading);
            }
            String currentReading = reading.substring(readingPos, readingKanaStart);
            // Kanji with its reading as furigana, then the kana itself outside the furigana
            output.append(String.format(RUBY, currentKanji, currentReading));
            output.append(currentKana);

            kanjiPos = kanaMatcher.end();
            readingPos = readingKanaStart + currentKana.length();
            hasKana = kanaMatcher.find();
        }

        if (kanjiPos < kanji.length()) {
            // The word ends in kanji: everything left in the reading belongs to it.
            // kanjiPos / readingPos are already exclusive end offsets, so no +1 here.
            if (readingPos >= reading.length()) {
                return naiveFurigana(kanji, reading);
            }
            output.append(String.format(RUBY, kanji.substring(kanjiPos), reading.substring(readingPos)));
        } else if (readingPos < reading.length()) {
            // The word is used up but part of the reading is not: the alignment went wrong
            return naiveFurigana(kanji, reading);
        }

        // Drop the separator space emitted in front of a leading kanji segment
        return output.toString().trim();
    }

    /** Annotates the whole word with the whole reading, without any attempt at alignment. */
    private static String naiveFurigana(String kanji, String reading) {
        return String.format(RUBY, kanji, reading);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment