Skip to content

Instantly share code, notes, and snippets.

@hborders
Last active March 17, 2017 13:56
Show Gist options
  • Save hborders/7701826 to your computer and use it in GitHub Desktop.
Save hborders/7701826 to your computer and use it in GitHub Desktop.
How to safely work with Unicode strings in Java. Inspired by http://mortoray.com/2013/11/27/the-string-type-is-broken/
import java.text.BreakIterator;
/**
 * Demonstrates that {@link String}'s built-in operations work on UTF-16 code
 * units and therefore split combining sequences, and shows
 * {@link BreakIterator}-based replacements that operate on the character
 * boundaries reported by {@link BreakIterator#getCharacterInstance()}.
 */
public class UnicodeTest {

    /**
     * Prints the results of the code-unit based {@code String} operations
     * followed by the results of the {@code BreakIterator} based helpers for
     * the same sample word.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // 'e' followed by U+0308 COMBINING DIAERESIS: two chars that render as one "ë".
        String noel = "noe\u0308l";

        System.out.println("noel=" + noel);

        System.out.println("Broken (java.lang.String methods):");
        System.out.println("noel.length=" + noel.length());
        System.out.println("noel.substring(0,3)=" + noel.substring(0, 3));
        System.out.println("noel.reverse="
                + new StringBuilder(noel).reverse().toString());

        System.out.println();

        System.out.println("Works (java.text.BreakIterator methods):");
        System.out.println("noel.breakIteratorLength=" + breakIteratorLength(noel));
        System.out.println("noel.breakIteratorSubstring(0,3)=" + breakIteratorSubstring(noel, 0, 3));
        System.out.println("noel.breakIteratorReverse=" + breakIteratorReverse(noel));

        // Output:
        // noel=noël
        // Broken (java.lang.String methods):
        // noel.length=5
        // noel.substring(0,3)=noe
        // noel.reverse=l̈eon
        //
        // Works (java.text.BreakIterator methods):
        // noel.breakIteratorLength=4
        // noel.breakIteratorSubstring(0,3)=noë
        // noel.breakIteratorReverse=lëon
    }

    /**
     * Counts the characters of {@code s} as delimited by a character
     * {@link BreakIterator}, so that a base letter plus its combining marks
     * counts once.
     *
     * @param s the string to measure; must not be {@code null}
     * @return the number of character-boundary-delimited segments in
     *         {@code s}; {@code 0} for the empty string
     */
    private static int breakIteratorLength(String s) {
        BreakIterator characterBreakIterator = BreakIterator.getCharacterInstance();
        characterBreakIterator.setText(s);
        characterBreakIterator.first();

        // Each successful step from one boundary to the next spans exactly one
        // character, so the number of steps is the length. For the empty
        // string the very first step already yields DONE.
        int length = 0;
        while (characterBreakIterator.next() != BreakIterator.DONE) {
            length++;
        }
        return length;
    }

    /**
     * Reverses {@code s} one character at a time, where characters are the
     * segments delimited by a character {@link BreakIterator}. Combining marks
     * therefore stay attached to (and after) their base letter.
     *
     * @param s the string to reverse; must not be {@code null}
     * @return a new string with the characters of {@code s} in reverse order
     */
    private static String breakIteratorReverse(String s) {
        BreakIterator characterBreakIterator = BreakIterator.getCharacterInstance();
        characterBreakIterator.setText(s);

        StringBuilder reversed = new StringBuilder(s.length());

        // Walk the boundaries from the end towards the start, copying the
        // segment between each pair of neighbouring boundaries as a unit.
        int segmentEnd = characterBreakIterator.last();
        int segmentStart = characterBreakIterator.previous();
        while (segmentStart != BreakIterator.DONE) {
            reversed.append(s, segmentStart, segmentEnd);
            segmentEnd = segmentStart;
            segmentStart = characterBreakIterator.previous();
        }
        return reversed.toString();
    }

    /**
     * Returns the part of {@code s} between two character indexes, where the
     * indexes count segments delimited by a character {@link BreakIterator}
     * rather than UTF-16 code units.
     *
     * @param s     the string to slice; must not be {@code null}
     * @param begin index of the first character to include
     * @param end   index one past the last character to include
     * @return the requested slice of {@code s}
     * @throws StringIndexOutOfBoundsException if {@code begin} is negative,
     *         {@code end} is smaller than {@code begin}, or either index is
     *         larger than the number of characters in {@code s}
     */
    private static String breakIteratorSubstring(String s, int begin, int end) {
        if (begin < 0 || end < begin) {
            throw new StringIndexOutOfBoundsException(
                    "invalid character range: begin " + begin + ", end " + end);
        }

        BreakIterator characterBreakIterator = BreakIterator.getCharacterInstance();
        characterBreakIterator.setText(s);
        characterBreakIterator.first();

        // next(0) does not move and returns the current boundary, so
        // begin == 0 maps to the start of the string.
        int beginBoundary = characterBreakIterator.next(begin);
        if (beginBoundary == BreakIterator.DONE) {
            // Reject here instead of handing the DONE sentinel to substring().
            throw new StringIndexOutOfBoundsException(
                    "begin " + begin + " is past the last character of the string");
        }

        int endBoundary = characterBreakIterator.next(end - begin);
        if (endBoundary == BreakIterator.DONE) {
            throw new StringIndexOutOfBoundsException(
                    "end " + end + " is past the last character of the string");
        }

        return s.substring(beginBoundary, endBoundary);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment