Skip to content

Instantly share code, notes, and snippets.

@shukob
Created June 6, 2013 10:08
Show Gist options
  • Save shukob/5720542 to your computer and use it in GitHub Desktop.
Save shukob/5720542 to your computer and use it in GitHub Desktop.
Java: Simple Japanese Hyphenation Tokenizer
/**
 * Splits text into tokens of a bounded display width, line by line.
 *
 * <p>Width is counted per UTF-16 unit: characters below U+0080 count as 1,
 * every other character counts as 2. Each half of a surrogate pair counts
 * as 1, so a supplementary character counts as 2 in total.
 *
 * <p>A Japanese full stop (U+3002) or comma (U+3001) that would otherwise
 * start the next token is appended to the current token instead
 * ("hanging" punctuation), so a token may exceed the width by one character.
 *
 * <p>Typical use:
 * <pre>
 *   tokenizer.setSourceString(text);
 *   while (tokenizer.hasNextToken()) {
 *       String token = tokenizer.popNextToken();
 *   }
 * </pre>
 *
 * <p>Instances hold a mutable cursor and perform no synchronization.
 */
class JapaneseHyphenationTokenizer {

    /** Code point of the ideographic full stop, U+3002. */
    final int kKutenCodePoint = '\u3002';

    /** Code point of the ideographic comma, U+3001. */
    final int kToutenCodePoint = '\u3001';

    /** The text most recently passed to {@link #setSourceString(String)}. */
    protected String mSourceString = "";

    /** Maximum width of one token, in the units described on the class. */
    protected int mTokenLength = 1;

    /** Index (in UTF-16 units) of the next untokenized character in the current line. */
    protected int mCurrentPositionInLine = 0;

    /** Index into {@link #mLines} of the line currently being tokenized. */
    protected int mCurrentLine = 0;

    /** The source text split on line breaks. */
    protected String[] mLines = new String[0];

    /**
     * Creates a tokenizer that emits tokens of at most {@code tokenLength}
     * width (plus an optional hanging punctuation character).
     *
     * @param tokenLength maximum token width; when a single character is
     *                    wider than this value the character is still emitted
     *                    on its own so that tokenization always makes progress
     */
    public JapaneseHyphenationTokenizer(int tokenLength) {
        this.mTokenLength = tokenLength;
    }

    /**
     * Width of a single UTF-16 unit.
     *
     * <p>Equivalent to {@code min(utf8ByteLength, 2)} of the one-character
     * string: a lone surrogate cannot be encoded and is replaced by a single
     * byte, hence width 1 per surrogate half.
     */
    private static int widthOfChar(char c) {
        return (c < 0x80 || Character.isSurrogate(c)) ? 1 : 2;
    }

    /**
     * Returns the width of {@code target} between {@code begin} (inclusive)
     * and {@code end} (exclusive).
     *
     * @param target string to measure
     * @param begin  first index to measure
     * @param end    index one past the last character to measure
     * @return the summed width; 0 when {@code begin >= end}
     * @throws IndexOutOfBoundsException if a measured index is outside {@code target}
     */
    public int widthOfStringBetweenIndeces(String target, int begin,
            int end) {
        int width = 0;
        for (int i = begin; i < end; ++i) {
            width += widthOfChar(target.charAt(i));
        }
        return width;
    }

    /**
     * Returns the width of the whole string.
     *
     * @param string string to measure
     * @return the summed width of every character
     */
    public int widthOfString(String string) {
        return widthOfStringBetweenIndeces(string, 0, string.length());
    }

    /**
     * Sets the text to tokenize and rewinds the cursor to its beginning.
     *
     * <p>The text is split on {@code \n} or {@code \r\n}; as with
     * {@link String#split(String)}, trailing empty lines are dropped.
     *
     * @param source text to tokenize, must not be {@code null}
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public void setSourceString(String source) {
        if (source == null) {
            // Fail before touching any state so the tokenizer stays usable.
            throw new NullPointerException("source");
        }
        this.mSourceString = source;
        this.mLines = source.split("\\r?\\n");
        // Without this reset a reused tokenizer would resume from the
        // cursor position of the previous text.
        this.mCurrentLine = 0;
        this.mCurrentPositionInLine = 0;
    }

    /**
     * Tells whether {@link #popNextToken()} can return another token.
     *
     * @return {@code true} if the current line has characters left, or is a
     *         blank line that still has to be reported as one empty token
     */
    public boolean hasNextToken() {
        if (mCurrentLine >= mLines.length) {
            return false;
        }
        int lineLength = mLines[mCurrentLine].length();
        // A blank line yields exactly one empty token before moving on.
        return lineLength == 0 || mCurrentPositionInLine < lineLength;
    }

    /**
     * Finds the end index of the widest run starting at {@code begin} whose
     * width does not exceed {@code stringWidth}.
     *
     * <p>The scan advances by code point, so the returned index never falls
     * between the two halves of a surrogate pair that starts at or after
     * {@code begin}.
     *
     * @param target      string to scan
     * @param begin       index at which the run starts
     * @param stringWidth maximum width of the run
     * @return the exclusive end index of the run, or {@code target.length()}
     *         if the rest of the string fits; equals {@code begin} when even
     *         the first character is too wide
     */
    public int seekForEndPositionOfStringWithStringWidth(String target,
            int begin, int stringWidth) {
        int length = target.length();
        int currentWidth = 0;
        int i = begin;
        while (i < length) {
            int next = i + Character.charCount(target.codePointAt(i));
            currentWidth += widthOfStringBetweenIndeces(target, i, next);
            if (currentWidth == stringWidth) {
                // Exactly full: the run ends right after this character.
                return next;
            }
            if (currentWidth > stringWidth) {
                // This character does not fit: the run ends before it.
                return i;
            }
            i = next;
        }
        return length;
    }

    /**
     * Returns the substring starting at {@code begin} that is as wide as
     * possible without exceeding {@code stringWidth}.
     *
     * @param target      string to cut
     * @param begin       index at which the substring starts
     * @param stringWidth maximum width of the substring
     * @return the substring, possibly empty
     */
    public String substringWithStringWidth(String target, int begin,
            int stringWidth) {
        int end = seekForEndPositionOfStringWithStringWidth(target, begin,
                stringWidth);
        return target.substring(begin, end);
    }

    /**
     * Returns the next token and advances the cursor past it.
     *
     * <p>Call only while {@link #hasNextToken()} returns {@code true}.
     *
     * @return the next token; an empty string for a blank line
     * @throws ArrayIndexOutOfBoundsException if all lines have been consumed
     */
    public String popNextToken() {
        String line = mLines[mCurrentLine];
        int begin = mCurrentPositionInLine;
        int lineLength = line.length();

        // Width of the part of this line that has not been tokenized yet.
        int remainingWidth = widthOfStringBetweenIndeces(line, begin,
                lineLength);
        if (remainingWidth < mTokenLength) {
            // Everything that is left fits in one token.
            String rest = line.substring(begin);
            moveToNextLine();
            return rest;
        }

        int end = seekForEndPositionOfStringWithStringWidth(line, begin,
                mTokenLength);
        if (end == begin && begin < lineLength) {
            // The first character alone is wider than the token width.
            // Emit it anyway; otherwise an empty token would be returned
            // forever without the cursor moving.
            end = begin + Character.charCount(line.codePointAt(begin));
        }
        if (hasHangingCharacterNextToPositionInString(end - 1, line)) {
            // Keep the punctuation mark with the text it follows.
            end += 1;
        }

        String token = line.substring(begin, end);
        if (end == lineLength) {
            moveToNextLine();
        } else {
            mCurrentPositionInLine = end;
        }
        return token;
    }

    /** Moves the cursor to the start of the following line. */
    protected void moveToNextLine() {
        mCurrentLine += 1;
        mCurrentPositionInLine = 0;
    }

    /**
     * Tells whether the character right after {@code position} is a
     * Japanese full stop or comma.
     *
     * @param position index of the last character of the current token
     * @param str      line being tokenized
     * @return {@code true} if {@code str} has a character at
     *         {@code position + 1} and it is U+3002 or U+3001;
     *         {@code false} when that index is outside the string
     */
    protected boolean hasHangingCharacterNextToPositionInString(
            int position, String str) {
        int next = position + 1;
        if (next < 0 || next >= str.length()) {
            return false;
        }
        int codePoint = str.codePointAt(next);
        return codePoint == kKutenCodePoint
                || codePoint == kToutenCodePoint;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment