ntninja · August 15, 2015 23:49
diff --git a/StripAccents.java b/StripAccents.java
 import java.text.Normalizer;

 // Hall of fame:
 //  - https://en.wikipedia.org/w/index.php?oldid=674250888 (279 reasons why you should prefer this method)
 //  - http://www.unicode.org/reports/tr15/tr15-23.html     (Explanation of different Unicode normalization forms)
 //  - http://stackoverflow.com/a/361345/277882             (How to iterate over the Unicode code points of a Java string, including surrogate pairs)
 //  - http://stackoverflow.com/a/29111105/277882           (How to check for combining characters in Java)

 /**
  * Utility for reducing text to a form suitable for accent- and
  * case-insensitive matching: compatibility-decomposes the input, drops
  * every combining mark and lower-cases what remains.
  */
 class StripAccents {
 	/**
 	 * Small self-check: normalizes a sample string, prints the result and
 	 * prints a warning line if it differs from the expected output.
 	 *
 	 * @param args ignored
 	 */
 	public static void main(String[] args) {
 		// Check that diacritical marks are stripped correctly and that
 		// compatibility mappings (for similar-looking characters with
 		// different codepoints) are applied correctly.
 		String string = "BLA áèịöű bla ねこ ﾈｺ ㎏ ṓ";
 		// Call the method on this class; there is no `Main` class here.
 		String result = normalize(string);
 		System.out.println(result);
 		if(!result.equals("bla aeiou bla ねこ ネコ kg o")) {
 			System.out.println("  Unexpected result!");
 		}
 	}

 	/**
 	 * Returns a lower-cased copy of {@code string} with all combining
 	 * marks removed and compatibility characters replaced by their
 	 * decomposition.
 	 *
 	 * @param string the text to normalize; must not be {@code null}
 	 * @return the normalized text, left in decomposed form
 	 * @throws NullPointerException if {@code string} is {@code null}
 	 *         (raised by {@link Normalizer#normalize})
 	 */
 	public static String normalize(String string) {
 		// Convert the input string to decomposed Unicode (the "D" in NFKD)
 		// so that the diacritical marks used in many European scripts
 		// (such as the circumflex in "ĉ") become separate characters.
 		// Also use compatibility decomposition (the "K") so that characters
 		// that have the same meaning as one or more other characters
 		// (such as "㎏" → "kg" or "ﾋ" → "ヒ") match when searching.
 		string = Normalizer.normalize(string, Normalizer.Form.NFKD);

 		StringBuilder result = new StringBuilder(string.length());

 		// Walk the string by code point rather than by char so that
 		// surrogate pairs are classified and copied as a single unit.
 		int offset = 0, strLen = string.length();
 		while(offset < strLen) {
 			int character = string.codePointAt(offset);
 			offset += Character.charCount(character);

 			// Only keep characters that are not combining marks. This way
 			// all the decomposed diacritical marks (and some other
 			// not-that-important modifiers) that were part of the original
 			// string or produced by the NFKD normalizer above disappear.
 			switch(Character.getType(character)) {
 				case Character.NON_SPACING_MARK:
 				case Character.COMBINING_SPACING_MARK:
 				case Character.ENCLOSING_MARK:
 					// Some combining character found: drop it
 					break;

 				default:
 					result.appendCodePoint(Character.toLowerCase(character));
 			}
 		}

 		// The result is intentionally not converted back to composed
 		// Unicode: all combining marks are gone, and as long as both sides
 		// of a comparison go through this method they end up in the same
 		// (decomposed) form.
 		return result.toString();
 	}
 }
	import java.text.Normalizer;

	// Hall of fame:
	// - https://en.wikipedia.org/w/index.php?oldid=674250888 (279 reasons why you should prefer this method)
	// - http://www.unicode.org/reports/tr15/tr15-23.html (Explanation of different Unicode normalization forms)
	// - http://stackoverflow.com/a/361345/277882 (How to iterate over the Unicode code points of a Java string, including surrogate pairs)
	// - http://stackoverflow.com/a/29111105/277882 (How to check for combining characters in Java)

	/**
	 * Utility for reducing text to a form suitable for accent- and
	 * case-insensitive matching: compatibility-decomposes the input, drops
	 * every combining mark and lower-cases what remains.
	 */
	class StripAccents {
		/**
		 * Small self-check: normalizes a sample string, prints the result and
		 * prints a warning line if it differs from the expected output.
		 *
		 * @param args ignored
		 */
		public static void main(String[] args) {
			// Check that diacritical marks are stripped correctly and that
			// compatibility mappings (for similar-looking characters with
			// different codepoints) are applied correctly.
			// The sample keeps a separator before "㎏" so that the input
			// and the expected output below agree with each other.
			String string = "BLA áèịöű bla ねこ ﾈｺ ㎏ ṓ";
			// Call the method on this class; there is no `Main` class here.
			String result = normalize(string);
			System.out.println(result);
			if(!result.equals("bla aeiou bla ねこ ネコ kg o")) {
				System.out.println(" Unexpected result!");
			}
		}

		/**
		 * Returns a lower-cased copy of {@code string} with all combining
		 * marks removed and compatibility characters replaced by their
		 * decomposition.
		 *
		 * @param string the text to normalize; must not be {@code null}
		 * @return the normalized text, left in decomposed form
		 * @throws NullPointerException if {@code string} is {@code null}
		 *         (raised by {@link Normalizer#normalize})
		 */
		public static String normalize(String string) {
			// Convert the input string to decomposed Unicode (the "D" in NFKD)
			// so that the diacritical marks used in many European scripts
			// (such as the circumflex in "ĉ") become separate characters.
			// Also use compatibility decomposition (the "K") so that characters
			// that have the same meaning as one or more other characters
			// (such as "㎏" → "kg" or "ﾋ" → "ヒ") match when searching.
			string = Normalizer.normalize(string, Normalizer.Form.NFKD);

			StringBuilder result = new StringBuilder(string.length());

			// Walk the string by code point rather than by char so that
			// surrogate pairs are classified and copied as a single unit.
			int offset = 0, strLen = string.length();
			while(offset < strLen) {
				int character = string.codePointAt(offset);
				offset += Character.charCount(character);

				// Only keep characters that are not combining marks. This way
				// all the decomposed diacritical marks (and some other
				// not-that-important modifiers) that were part of the original
				// string or produced by the NFKD normalizer above disappear.
				switch(Character.getType(character)) {
					case Character.NON_SPACING_MARK:
					case Character.COMBINING_SPACING_MARK:
					case Character.ENCLOSING_MARK:
						// Some combining character found: drop it
						break;

					default:
						result.appendCodePoint(Character.toLowerCase(character));
				}
			}

			// The result is intentionally not converted back to composed
			// Unicode: all combining marks are gone, and as long as both sides
			// of a comparison go through this method they end up in the same
			// (decomposed) form.
			return result.toString();
		}
	}