Created
August 15, 2015 23:49
-
-
Save ntninja/d431ea365216d05ec32d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.text.Normalizer; | |
// Hall of fame:
// - https://en.wikipedia.org/w/index.php?oldid=674250888 (279 reasons why you should prefer this method)
// - http://www.unicode.org/reports/tr15/tr15-23.html (Explanation of the different Unicode normalization forms)
// - http://stackoverflow.com/a/361345/277882 (How to iterate over the Unicode code points of a Java string, including those beyond the 16-bit range)
// - http://stackoverflow.com/a/29111105/277882 (How to check for combining characters in Java)
/**
 * Folds text into an accent-insensitive, case-insensitive form intended for
 * search matching: compatibility-decomposes the input, drops combining marks
 * and lowercases what remains.
 */
class StripAccents {
    /**
     * Small self-check: normalizes a sample string, prints the result and
     * prints a warning line if it differs from the expected output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Check that diacritical marks are stripped correctly and that
        // compatibility mappings (for similar-looking characters with
        // different code points) are applied correctly.
        String string = "BLA áèịöű bla ねこ ネコ ㎏ ṓ";
        // normalize() is a static method of this class, so call it directly.
        String result = normalize(string);
        System.out.println(result);
        if (!result.equals("bla aeiou bla ねこ ネコ kg o")) {
            System.out.println(" Unexpected result!");
        }
    }

    /**
     * Returns a lowercased copy of {@code string} with all non-spacing and
     * combining spacing marks removed after NFKD normalization.
     *
     * @param string the text to fold; must not be {@code null}
     * @return the folded text (empty if the input is empty)
     * @throws NullPointerException if {@code string} is {@code null}
     *         (raised by {@link Normalizer#normalize})
     */
    public static String normalize(String string) {
        // Convert the input to decomposed Unicode (the "D" in NFKD) so that
        // the diacritical marks used in many European scripts (such as the
        // circumflex in "LATIN SMALL LETTER C WITH CIRCUMFLEX") become
        // separate code points.
        // Also use compatibility decomposition (the "K") so that characters
        // which have the same meaning as one or more other characters
        // (such as U+338F SQUARE KG -> "kg", or U+FF8B HALFWIDTH KATAKANA
        // LETTER HI -> U+30D2 KATAKANA LETTER HI) match each other when
        // searching.
        String decomposed = Normalizer.normalize(string, Normalizer.Form.NFKD);

        final int length = decomposed.length();
        StringBuilder result = new StringBuilder(length);

        // Walk the string by code point rather than by char so that
        // surrogate pairs are treated as a single character.
        int offset = 0;
        while (offset < length) {
            int codePoint = decomposed.codePointAt(offset);
            offset += Character.charCount(codePoint);

            // Only keep characters that are not combining marks. This way
            // all the decomposed diacritical marks (and some other
            // not-that-important modifiers) that were part of the original
            // string or were produced by the NFKD normalizer above
            // disappear. Note that enclosing marks
            // (Character.ENCLOSING_MARK) are not filtered here.
            switch (Character.getType(codePoint)) {
                case Character.NON_SPACING_MARK:
                case Character.COMBINING_SPACING_MARK:
                    // Combining mark: drop it.
                    break;
                default:
                    result.appendCodePoint(Character.toLowerCase(codePoint));
                    break;
            }
        }

        // The combining marks were stripped in the loop above, so the result
        // is returned as-is without converting it back to a composed
        // normalization form.
        return result.toString();
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment