shukob · June 6, 2013 10:08
diff --git a/gistfile1.java b/gistfile1.java
  	/**
  	 * Breaks a source string into tokens no wider than a configured display
  	 * width, working through the source one line at a time.
  	 *
  	 * Width is counted in units: ASCII characters count 1, every other
  	 * character counts 2. The source is split on line breaks ("\n" or "\r\n");
  	 * a token never spans two lines and a blank line yields one empty token.
  	 * When the character that would start the next token is a kuten (U+3002)
  	 * or touten (U+3001), it is appended to the current token instead
  	 * ("hanging" punctuation), so a token may exceed the limit by that one
  	 * character.
  	 *
  	 * Typical use: call {@link #setSourceString(String)}, then call
  	 * {@link #popNextToken()} while {@link #hasNextToken()} returns true.
  	 */
  	class JapaneseHyphenationTokenizer {

 			/** Text most recently passed to {@link #setSourceString(String)}. */
 			protected String mSourceString = "";
 			/** Maximum width of one token, in width units. */
 			protected int mTokenLength = 1;
 			/** Index in the current line of the first character not yet returned. */
 			protected int mCurrentPositionInLine = 0;
 			/** Index into {@link #mLines} of the line currently being tokenized. */
 			protected int mCurrentLine = 0;
 			/** The source string split on line breaks. */
 			protected String[] mLines = new String[0];

 			/**
 			 * Creates a tokenizer with no source text.
 			 *
 			 * @param tokenLength maximum token width in width units; when it is
 			 *            smaller than the width of the next character, that
 			 *            single character is still returned as a token
 			 */
 			public JapaneseHyphenationTokenizer(int tokenLength) {
 				super();
 				this.mTokenLength = tokenLength;
 			}

 			/**
 			 * Measures the width of {@code target[begin, end)}.
 			 *
 			 * Each UTF-16 code unit counts 1 when it is ASCII or one half of a
 			 * surrogate pair and 2 otherwise, so a complete surrogate pair has
 			 * a total width of 2.
 			 *
 			 * @param target string to measure
 			 * @param begin index of the first code unit to measure (inclusive)
 			 * @param end index after the last code unit to measure (exclusive)
 			 * @return the summed width; 0 when {@code begin >= end}
 			 */
 			public int widthOfStringBetweenIndeces(String target, int begin,
 					int end) {
 				int res = 0;
 				char[] chars = target.toCharArray();
 				for (int i = begin; i < end; ++i) {
 					res += widthOfChar(chars[i]);
 				}
 				return res;
 			}

 			/**
 			 * Measures the width of a whole string.
 			 *
 			 * @param string string to measure
 			 * @return width of {@code string} in width units
 			 */
 			public int widthOfString(String string) {
 				return widthOfStringBetweenIndeces(string, 0, string.length());
 			}

 			/**
 			 * Sets the text to tokenize and restarts tokenizing from its first
 			 * line.
 			 *
 			 * String.split discards trailing empty strings, so blank lines at
 			 * the very end of the source produce no tokens.
 			 *
 			 * @param source text to tokenize
 			 */
 			public void setSourceString(String source) {
 				this.mSourceString = source;
 				mLines = mSourceString.split("\\r?\\n");
 				// Rewind the cursor; otherwise a reused tokenizer would resume
 				// at indices that belong to the previous source.
 				mCurrentLine = 0;
 				mCurrentPositionInLine = 0;
 			}

 			/**
 			 * Tells whether {@link #popNextToken()} can be called.
 			 *
 			 * @return true while a line remains that is either blank or not yet
 			 *         consumed to its end
 			 */
 			public boolean hasNextToken() {
 				if (mCurrentLine >= mLines.length) {
 					return false;
 				}
 				String line = mLines[mCurrentLine];
 				// A blank line still counts: it is reported once as an empty
 				// token before the tokenizer moves on to the following line.
 				return mCurrentPositionInLine < line.length()
 						|| line.length() == 0;
 			}

 			/**
 			 * Finds the exclusive end index of the widest run starting at
 			 * {@code begin} whose width does not exceed {@code stringWidth}.
 			 *
 			 * The scan advances by code point, so the returned index never
 			 * falls between the two halves of a surrogate pair.
 			 *
 			 * @param target string to scan
 			 * @param begin index at which the run starts
 			 * @param stringWidth maximum width of the run
 			 * @return the end index of the run; {@code target.length()} when the
 			 *         rest of the string fits; {@code begin} when even the first
 			 *         character is too wide
 			 */
 			public int seekForEndPositionOfStringWithStringWidth(String target,
 					int begin, int stringWidth) {
 				int length = target.length();
 				int currentWidth = 0;
 				int i = begin;
 				while (i < length) {
 					int unitCount = Character.charCount(target.codePointAt(i));
 					// A surrogate pair is two code units of width 1 each.
 					currentWidth += (unitCount == 2) ? 2 : widthOfChar(target
 							.charAt(i));
 					if (currentWidth == stringWidth) {
 						// Exact fit: the run ends right after this character.
 						return i + unitCount;
 					}
 					if (currentWidth > stringWidth) {
 						// This character overflows, so the run ends before it.
 						return i;
 					}
 					i += unitCount;
 				}
 				return length;
 			}

 			/**
 			 * Returns the widest substring starting at {@code begin} whose width
 			 * does not exceed {@code stringWidth}.
 			 *
 			 * @param target string to cut from
 			 * @param begin index at which the substring starts
 			 * @param stringWidth maximum width of the substring
 			 * @return the substring, possibly empty
 			 */
 			public String substringWithStringWidth(String target, int begin,
 					int stringWidth) {
 				return target.substring(
 						begin,
 						seekForEndPositionOfStringWithStringWidth(target,
 								begin, stringWidth));
 			}

 			/**
 			 * Returns the next token and advances past it.
 			 *
 			 * @return the next token; an empty string for a blank line
 			 * @throws ArrayIndexOutOfBoundsException if every line has already
 			 *             been consumed; check {@link #hasNextToken()} first
 			 */
 			public String popNextToken() {
 				String line = mLines[mCurrentLine];
 				int start = mCurrentPositionInLine;

 				// Width of the part of this line that is not yet tokenized.
 				int remainingWidth = widthOfStringBetweenIndeces(line, start,
 						line.length());

 				// Everything left fits with room to spare: return it all and
 				// move to the next line.
 				if (remainingWidth < mTokenLength) {
 					String res = line.substring(start);
 					moveToNextLine();
 					return res;
 				}

 				int end = seekForEndPositionOfStringWithStringWidth(line,
 						start, mTokenLength);

 				// When the very first character is wider than the limit the
 				// seek makes no progress. Take that one character anyway, or
 				// the caller would receive empty tokens forever.
 				if (end <= start && start < line.length()) {
 					end = start + Character.charCount(line.codePointAt(start));
 				}

 				// Pull a following kuten/touten into this token so that the
 				// next token does not begin with punctuation.
 				if (hasHangingCharacterNextToPositionInString(end - 1, line)) {
 					end += 1;
 				}

 				String res = line.substring(start, end);
 				if (end == line.length()) {
 					moveToNextLine();
 				} else {
 					mCurrentPositionInLine = end;
 				}
 				return res;
 			}

 			/** Advances the cursor to the start of the following line. */
 			protected void moveToNextLine() {
 				mCurrentLine += 1;
 				mCurrentPositionInLine = 0;
 			}

 			// Written as numeric code points so the values do not depend on
 			// the encoding used to compile this file.
 			/** Code point of the kuten, U+3002 IDEOGRAPHIC FULL STOP. */
 			final int kKutenCodePoint = 0x3002;
 			/** Code point of the touten, U+3001 IDEOGRAPHIC COMMA. */
 			final int kToutenCodePoint = 0x3001;

 			/**
 			 * Tells whether the character right after {@code position} is a
 			 * kuten or a touten.
 			 *
 			 * @param position index of the last character of the current token
 			 * @param str line being tokenized
 			 * @return true when {@code str} has a character at
 			 *         {@code position + 1} and it is a kuten or touten; false
 			 *         when that index lies outside the string
 			 */
 			protected boolean hasHangingCharacterNextToPositionInString(
 					int position, String str) {
 				int next = position + 1;
 				if (next < 0 || next >= str.length()) {
 					return false;
 				}
 				int targetCodePoint = str.codePointAt(next);
 				return targetCodePoint == kKutenCodePoint
 						|| targetCodePoint == kToutenCodePoint;
 			}

 			/**
 			 * Width of a single UTF-16 code unit: 1 for ASCII and for either
 			 * half of a surrogate pair, 2 for anything else.
 			 */
 			private int widthOfChar(char c) {
 				boolean narrow = c < 0x80 || Character.isHighSurrogate(c)
 						|| Character.isLowSurrogate(c);
 				return narrow ? 1 : 2;
 			}

 		}
	/**
	 * Breaks a source string into tokens no wider than a configured display
	 * width, working through the source one line at a time.
	 *
	 * Width is counted in units: ASCII characters count 1, every other
	 * character counts 2. The source is split on line breaks ("\n" or "\r\n");
	 * a token never spans two lines and a blank line yields one empty token.
	 * When the character that would start the next token is a kuten (U+3002)
	 * or touten (U+3001), it is appended to the current token instead
	 * ("hanging" punctuation), so a token may exceed the limit by that one
	 * character.
	 *
	 * Typical use: call {@link #setSourceString(String)}, then call
	 * {@link #popNextToken()} while {@link #hasNextToken()} returns true.
	 */
	class JapaneseHyphenationTokenizer {

		/** Text most recently passed to {@link #setSourceString(String)}. */
		protected String mSourceString = "";
		/** Maximum width of one token, in width units. */
		protected int mTokenLength = 1;
		/** Index in the current line of the first character not yet returned. */
		protected int mCurrentPositionInLine = 0;
		/** Index into {@link #mLines} of the line currently being tokenized. */
		protected int mCurrentLine = 0;
		/** The source string split on line breaks. */
		protected String[] mLines = new String[0];

		/**
		 * Creates a tokenizer with no source text.
		 *
		 * @param tokenLength maximum token width in width units; when it is
		 *            smaller than the width of the next character, that
		 *            single character is still returned as a token
		 */
		public JapaneseHyphenationTokenizer(int tokenLength) {
			super();
			this.mTokenLength = tokenLength;
		}

		/**
		 * Measures the width of {@code target[begin, end)}.
		 *
		 * Each UTF-16 code unit counts 1 when it is ASCII or one half of a
		 * surrogate pair and 2 otherwise, so a complete surrogate pair has a
		 * total width of 2.
		 *
		 * @param target string to measure
		 * @param begin index of the first code unit to measure (inclusive)
		 * @param end index after the last code unit to measure (exclusive)
		 * @return the summed width; 0 when {@code begin >= end}
		 */
		public int widthOfStringBetweenIndeces(String target, int begin,
				int end) {
			int res = 0;
			char[] chars = target.toCharArray();
			for (int i = begin; i < end; ++i) {
				res += widthOfChar(chars[i]);
			}
			return res;
		}

		/**
		 * Measures the width of a whole string.
		 *
		 * @param string string to measure
		 * @return width of {@code string} in width units
		 */
		public int widthOfString(String string) {
			return widthOfStringBetweenIndeces(string, 0, string.length());
		}

		/**
		 * Sets the text to tokenize and restarts tokenizing from its first
		 * line.
		 *
		 * String.split discards trailing empty strings, so blank lines at the
		 * very end of the source produce no tokens.
		 *
		 * @param source text to tokenize
		 */
		public void setSourceString(String source) {
			this.mSourceString = source;
			mLines = mSourceString.split("\\r?\\n");
			// Rewind the cursor; otherwise a reused tokenizer would resume at
			// indices that belong to the previous source.
			mCurrentLine = 0;
			mCurrentPositionInLine = 0;
		}

		/**
		 * Tells whether {@link #popNextToken()} can be called.
		 *
		 * @return true while a line remains that is either blank or not yet
		 *         consumed to its end
		 */
		public boolean hasNextToken() {
			if (mCurrentLine >= mLines.length) {
				return false;
			}
			String line = mLines[mCurrentLine];
			// A blank line still counts: it is reported once as an empty token
			// before the tokenizer moves on to the following line.
			return mCurrentPositionInLine < line.length()
					|| line.length() == 0;
		}

		/**
		 * Finds the exclusive end index of the widest run starting at
		 * {@code begin} whose width does not exceed {@code stringWidth}.
		 *
		 * The scan advances by code point, so the returned index never falls
		 * between the two halves of a surrogate pair.
		 *
		 * @param target string to scan
		 * @param begin index at which the run starts
		 * @param stringWidth maximum width of the run
		 * @return the end index of the run; {@code target.length()} when the
		 *         rest of the string fits; {@code begin} when even the first
		 *         character is too wide
		 */
		public int seekForEndPositionOfStringWithStringWidth(String target,
				int begin, int stringWidth) {
			int length = target.length();
			int currentWidth = 0;
			int i = begin;
			while (i < length) {
				int unitCount = Character.charCount(target.codePointAt(i));
				// A surrogate pair is two code units of width 1 each.
				currentWidth += (unitCount == 2) ? 2 : widthOfChar(target
						.charAt(i));
				if (currentWidth == stringWidth) {
					// Exact fit: the run ends right after this character.
					return i + unitCount;
				}
				if (currentWidth > stringWidth) {
					// This character overflows, so the run ends before it.
					return i;
				}
				i += unitCount;
			}
			return length;
		}

		/**
		 * Returns the widest substring starting at {@code begin} whose width
		 * does not exceed {@code stringWidth}.
		 *
		 * @param target string to cut from
		 * @param begin index at which the substring starts
		 * @param stringWidth maximum width of the substring
		 * @return the substring, possibly empty
		 */
		public String substringWithStringWidth(String target, int begin,
				int stringWidth) {
			return target.substring(
					begin,
					seekForEndPositionOfStringWithStringWidth(target, begin,
							stringWidth));
		}

		/**
		 * Returns the next token and advances past it.
		 *
		 * @return the next token; an empty string for a blank line
		 * @throws ArrayIndexOutOfBoundsException if every line has already
		 *             been consumed; check {@link #hasNextToken()} first
		 */
		public String popNextToken() {
			String line = mLines[mCurrentLine];
			int start = mCurrentPositionInLine;

			// Width of the part of this line that is not yet tokenized.
			int remainingWidth = widthOfStringBetweenIndeces(line, start,
					line.length());

			// Everything left fits with room to spare: return it all and move
			// to the next line.
			if (remainingWidth < mTokenLength) {
				String res = line.substring(start);
				moveToNextLine();
				return res;
			}

			int end = seekForEndPositionOfStringWithStringWidth(line, start,
					mTokenLength);

			// When the very first character is wider than the limit the seek
			// makes no progress. Take that one character anyway, or the
			// caller would receive empty tokens forever.
			if (end <= start && start < line.length()) {
				end = start + Character.charCount(line.codePointAt(start));
			}

			// Pull a following kuten/touten into this token so that the next
			// token does not begin with punctuation.
			if (hasHangingCharacterNextToPositionInString(end - 1, line)) {
				end += 1;
			}

			String res = line.substring(start, end);
			if (end == line.length()) {
				moveToNextLine();
			} else {
				mCurrentPositionInLine = end;
			}
			return res;
		}

		/** Advances the cursor to the start of the following line. */
		protected void moveToNextLine() {
			mCurrentLine += 1;
			mCurrentPositionInLine = 0;
		}

		// Written as numeric code points so the values do not depend on the
		// encoding used to compile this file.
		/** Code point of the kuten, U+3002 IDEOGRAPHIC FULL STOP. */
		final int kKutenCodePoint = 0x3002;
		/** Code point of the touten, U+3001 IDEOGRAPHIC COMMA. */
		final int kToutenCodePoint = 0x3001;

		/**
		 * Tells whether the character right after {@code position} is a kuten
		 * or a touten.
		 *
		 * @param position index of the last character of the current token
		 * @param str line being tokenized
		 * @return true when {@code str} has a character at
		 *         {@code position + 1} and it is a kuten or touten; false
		 *         when that index lies outside the string
		 */
		protected boolean hasHangingCharacterNextToPositionInString(
				int position, String str) {
			int next = position + 1;
			if (next < 0 || next >= str.length()) {
				return false;
			}
			int targetCodePoint = str.codePointAt(next);
			return targetCodePoint == kKutenCodePoint
					|| targetCodePoint == kToutenCodePoint;
		}

		/**
		 * Width of a single UTF-16 code unit: 1 for ASCII and for either half
		 * of a surrogate pair, 2 for anything else.
		 */
		private int widthOfChar(char c) {
			boolean narrow = c < 0x80 || Character.isHighSurrogate(c)
					|| Character.isLowSurrogate(c);
			return narrow ? 1 : 2;
		}

	}