Created
June 6, 2013 10:08
-
-
Save shukob/5720542 to your computer and use it in GitHub Desktop.
Java: Simple Japanese Hyphenation Tokenizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Splits text into tokens whose display width does not exceed a configured
 * limit, for simple line wrapping of Japanese text.
 *
 * <p>Width is counted per UTF-16 {@code char}: a {@code char} that encodes to a
 * single UTF-8 byte counts as 1, every other {@code char} counts as 2. A lone
 * surrogate counts as 1, so a surrogate pair counts as 2 in total.
 *
 * <p>The source is first split into lines on {@code \n} / {@code \r\n}; tokens
 * never span lines, and a blank line yields one empty token. When the character
 * right after a token is an ideographic full stop (U+3002) or ideographic comma
 * (U+3001), that character is appended to the token ("hanging" punctuation), so
 * such a token may exceed the width limit by one character.
 *
 * <p>Typical use:
 * <pre>
 *   tokenizer.setSourceString(text);
 *   while (tokenizer.hasNextToken()) {
 *       String line = tokenizer.popNextToken();
 *   }
 * </pre>
 *
 * <p>Instances hold mutable cursor state and perform no synchronization.
 */
class JapaneseHyphenationTokenizer {

    /** Code point of U+3002 IDEOGRAPHIC FULL STOP (kuten). */
    static final int kKutenCodePoint = 0x3002;

    /** Code point of U+3001 IDEOGRAPHIC COMMA (touten). */
    static final int kToutenCodePoint = 0x3001;

    /** The string most recently passed to {@link #setSourceString(String)}. */
    protected String mSourceString = "";

    /** Maximum width of a token, in the units described on the class. */
    protected int mTokenLength = 1;

    /** Index of the first not-yet-consumed char in the current line. */
    protected int mCurrentPositionInLine = 0;

    /** Index into {@link #mLines} of the line currently being consumed. */
    protected int mCurrentLine = 0;

    /** The source string split into lines. */
    protected String[] mLines = new String[0];

    /**
     * Creates a tokenizer with the given maximum token width.
     *
     * @param tokenLength maximum width of each token; if it is smaller than the
     *                    width of a character, tokens still contain at least
     *                    one character so that tokenizing always terminates
     */
    public JapaneseHyphenationTokenizer(int tokenLength) {
        this.mTokenLength = tokenLength;
    }

    /**
     * Returns the width of {@code target[begin, end)}.
     *
     * @param target string to measure
     * @param begin  index of the first char to measure (inclusive)
     * @param end    index after the last char to measure (exclusive)
     * @return the summed width; 0 when {@code begin >= end}
     */
    public int widthOfStringBetweenIndeces(String target, int begin,
            int end) {
        int width = 0;
        for (int i = begin; i < end; ++i) {
            width += widthOfChar(target.charAt(i));
        }
        return width;
    }

    /**
     * Width of a single UTF-16 char: min(UTF-8 byte length, 2).
     * ASCII encodes to one byte. A lone surrogate is not encodable and is
     * replaced by '?' (one byte) when encoded on its own. Everything else
     * needs two or three bytes and is therefore capped at 2.
     */
    private static int widthOfChar(char c) {
        return (c < 0x80 || Character.isSurrogate(c)) ? 1 : 2;
    }

    /**
     * Returns the width of the whole string.
     *
     * @param string string to measure
     * @return the summed width of all chars in {@code string}
     */
    public int widthOfString(String string) {
        return widthOfStringBetweenIndeces(string, 0, string.length());
    }

    /**
     * Sets the string to be tokenized and rewinds the cursor to its start.
     *
     * @param source text to tokenize
     * @throws NullPointerException if {@code source} is null
     */
    public void setSourceString(String source) {
        if (source == null) {
            // Fail before touching any state so the tokenizer stays usable.
            throw new NullPointerException("source must not be null");
        }
        this.mSourceString = source;
        mLines = source.split("\\r?\\n");
        // Start over: a cursor left from a previous source would point at an
        // unrelated (possibly out-of-range) position in the new lines.
        mCurrentLine = 0;
        mCurrentPositionInLine = 0;
    }

    /**
     * Tells whether another token can be extracted.
     *
     * @return true when a line remains and either it has unconsumed chars or
     *         it is blank (a blank line still yields one empty token)
     */
    public boolean hasNextToken() {
        if (mCurrentLine >= mLines.length) {
            return false;
        }
        int lineLength = mLines[mCurrentLine].length();
        return mCurrentPositionInLine < lineLength || lineLength == 0;
    }

    /**
     * Finds the exclusive end index of the widest prefix of
     * {@code target[begin..]} whose width does not exceed {@code stringWidth}.
     *
     * @param target      string to scan
     * @param begin       index to start scanning from
     * @param stringWidth maximum allowed width
     * @return the end index; {@code target.length()} when the rest of the
     *         string fits, and {@code begin} when not even the first char fits
     */
    public int seekForEndPositionOfStringWithStringWidth(String target,
            int begin, int stringWidth) {
        int accumulated = 0;
        for (int i = begin; i < target.length(); ++i) {
            accumulated += widthOfStringBetweenIndeces(target, i, i + 1);
            if (accumulated == stringWidth) {
                // Exactly full: the char at i is the last one included.
                return i + 1;
            }
            if (accumulated > stringWidth) {
                // The char at i overflows the limit, so it is excluded.
                return i;
            }
        }
        return target.length();
    }

    /**
     * Returns the substring starting at {@code begin} that is as wide as
     * possible without exceeding {@code stringWidth}.
     *
     * @param target      string to cut from
     * @param begin       index of the first char of the result
     * @param stringWidth maximum allowed width
     * @return the selected substring (possibly empty)
     */
    public String substringWithStringWidth(String target, int begin,
            int stringWidth) {
        int end = seekForEndPositionOfStringWithStringWidth(target, begin,
                stringWidth);
        return target.substring(begin, end);
    }

    /**
     * Extracts the next token and advances the cursor past it.
     *
     * <p>Call only while {@link #hasNextToken()} returns true.
     *
     * @return the next token; an empty string for a blank line
     * @throws ArrayIndexOutOfBoundsException if all lines have been consumed
     */
    public String popNextToken() {
        final String line = mLines[mCurrentLine];
        final int start = mCurrentPositionInLine;

        // If what is left of the line is narrower than the limit, emit all of
        // it and continue with the next line.
        int remainingWidth = widthOfStringBetweenIndeces(line, start,
                line.length());
        if (remainingWidth < mTokenLength) {
            String rest = line.substring(start);
            moveToNextLine();
            return rest;
        }

        int end = seekForEndPositionOfStringWithStringWidth(line, start,
                mTokenLength);
        if (end <= start && start < line.length()) {
            // The first remaining char alone is wider than the limit. Take it
            // anyway; otherwise the token would be empty, the cursor would not
            // move, and the caller's hasNextToken() loop would never end.
            end = start + 1;
        }

        // Pull a directly following full stop / comma into this token so that
        // the next token does not begin with punctuation.
        if (hasHangingCharacterNextToPositionInString(end - 1, line)) {
            end += 1;
        }

        String token = line.substring(start, end);
        if (end == line.length()) {
            moveToNextLine();
        } else {
            mCurrentPositionInLine = end;
        }
        return token;
    }

    /** Advances the cursor to the beginning of the next line. */
    protected void moveToNextLine() {
        mCurrentLine += 1;
        mCurrentPositionInLine = 0;
    }

    /**
     * Tells whether the character following {@code position} is punctuation
     * that should hang on the preceding token.
     *
     * @param position index of the last char of the candidate token
     * @param str      line being tokenized
     * @return true when {@code str} has a char at {@code position + 1} and it
     *         is U+3002 or U+3001; false otherwise
     */
    protected boolean hasHangingCharacterNextToPositionInString(
            int position, String str) {
        int next = position + 1;
        if (next < 0 || next >= str.length()) {
            return false;
        }
        int codePoint = str.codePointAt(next);
        return codePoint == kKutenCodePoint || codePoint == kToutenCodePoint;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment