Skip to content

Instantly share code, notes, and snippets.

@ntninja
Created August 15, 2015 23:49
Show Gist options
  • Save ntninja/d431ea365216d05ec32d to your computer and use it in GitHub Desktop.
Save ntninja/d431ea365216d05ec32d to your computer and use it in GitHub Desktop.
import java.text.Normalizer;
// Hall of fame:
// - https://en.wikipedia.org/w/index.php?oldid=674250888 (279 reasons why you should prefer this method)
// - http://www.unicode.org/reports/tr15/tr15-23.html (Explanation of different Unicode normalization forms)
// - http://stackoverflow.com/a/361345/277882 (How to iterate over the Unicode code points of a Java String, including those outside the BMP)
// - http://stackoverflow.com/a/29111105/277882 (How to check for combining characters in Java)
/**
 * Folds text into a form suitable for accent- and case-insensitive matching.
 *
 * <p>{@link #normalize(String)} applies Unicode compatibility decomposition
 * (NFKD), removes every combining mark and lower-cases what is left.
 * {@link #main(String[])} runs a small self-check on a sample string.
 */
class StripAccents {
    /**
     * Normalizes a sample string, prints the result and prints a warning line
     * if the result is not the expected one.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Check that diacritical marks are stripped correctly and that
        // compatibility mappings (for similar-looking characters with
        // different code points) are applied correctly.
        String string = "BLA áèịöű bla ねこ ネコ ㎏ ṓ";
        String result = normalize(string);
        System.out.println(result);
        if (!result.equals("bla aeiou bla ねこ ネコ kg o")) {
            System.out.println(" Unexpected result!");
        }
    }

    /**
     * Returns a lower-cased copy of {@code string} with all diacritics and
     * other combining marks removed and compatibility characters replaced by
     * their plain equivalents.
     *
     * <p>The result is intentionally left in decomposed form (no NFC/NFKC
     * pass is applied afterwards). Decomposition products that are not marks,
     * such as the conjoining jamo of a Hangul syllable, therefore stay
     * decomposed.
     *
     * @param string the text to fold
     * @return the folded text; empty if {@code string} is empty
     * @throws NullPointerException if {@code string} is {@code null}
     *         (raised by {@link Normalizer#normalize})
     */
    public static String normalize(String string) {
        // Decompose (D) so that a letter with diacritics, such as "ĉ",
        // becomes a base letter followed by separate combining marks
        // ("c" + U+0302). Use the compatibility variant (K) as well so that
        // characters which merely are presentation forms of other characters
        // are replaced by them, e.g. "㎏" (U+338F) becomes "kg" and half-width
        // katakana "ﾋ" (U+FF8B) becomes full-width "ヒ" (U+30D2).
        String decomposed = Normalizer.normalize(string, Normalizer.Form.NFKD);

        StringBuilder result = new StringBuilder(decomposed.length());
        int length = decomposed.length();
        int offset = 0;
        while (offset < length) {
            // Walk by code point, not by char, so that characters outside the
            // BMP (stored as surrogate pairs) are classified as a whole.
            int codePoint = decomposed.codePointAt(offset);
            offset += Character.charCount(codePoint);

            // Dropping the marks removes both the diacritics produced by the
            // decomposition above and any that were already separate in the
            // input.
            if (!isCombiningMark(codePoint)) {
                result.appendCodePoint(Character.toLowerCase(codePoint));
            }
        }
        return result.toString();
    }

    /** Tells whether the code point is in a Unicode mark category (Mn, Mc or Me). */
    private static boolean isCombiningMark(int codePoint) {
        switch (Character.getType(codePoint)) {
            case Character.NON_SPACING_MARK:
            case Character.COMBINING_SPACING_MARK:
            case Character.ENCLOSING_MARK:
                return true;
            default:
                return false;
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment