Created
May 2, 2011 14:40
-
-
Save andrewspencer/951699 to your computer and use it in GitHub Desktop.
Remove accents from a String
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package net.andrewspencer.util; | |
import sun.text.Normalizer; // java.text.Normalizer in 1.6 | |
/**
 * String utilities; shows how to remove accents from a String.
 *
 * <p>Uses the standard {@code java.text.Normalizer} (available since Java 1.6)
 * rather than the unofficial, JVM-dependent {@code sun.text.Normalizer}.
 * The standard class is referenced by its fully-qualified name because this
 * file imports {@code sun.text.Normalizer} under the same simple name; that
 * import is no longer needed by {@link #removeAccents(String)} and can be
 * dropped.
 *
 * <p>NB Letters that have no Unicode decomposition (for example U+0152 OE,
 * U+0153 oe, U+00DF sharp s, U+00D0 Eth, U+00D8 O with stroke) are left
 * untouched. Compatibility characters that do have a decomposition (for
 * example the U+FB01 "fi" ligature) are expanded by NFKD normalization.
 */
public final class StringUtil {

    // ... (other members elided in the original listing)

    /**
     * Matches one or more characters from the Unicode block
     * CombiningDiacriticalMarks. Compiled once because the pattern is
     * constant and {@code Pattern} instances are immutable.
     */
    private static final java.util.regex.Pattern COMBINING_DIACRITICAL_MARKS =
            java.util.regex.Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /**
     * Removes accents (diacritical marks) from the given string.
     *
     * <p>{@code Normalizer.normalize()} with form NFKD converts each accented
     * character into one non-accented character followed by one or more
     * characters representing the accent(s) alone. Those accent-only
     * characters belong to the Unicode block CombiningDiacriticalMarks and
     * are then stripped out.
     *
     * @param notNullSource the string to process; must not be {@code null}
     * @return the NFKD-normalized string with all combining diacritical
     *         marks removed
     * @throws NullPointerException if {@code notNullSource} is {@code null}
     */
    public static String removeAccents(String notNullSource) {
        if (notNullSource == null) {
            throw new NullPointerException("notNullSource must not be null");
        }
        String decomposed = java.text.Normalizer.normalize(
                notNullSource, java.text.Normalizer.Form.NFKD);
        return COMBINING_DIACRITICAL_MARKS.matcher(decomposed).replaceAll("");
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package net.andrewspencer.util; | |
import static org.fest.assertions.Assertions.assertThat; | |
import static net.andrewspencer.util.StringUtil.*; | |
import org.testng.annotations.Test; | |
public class StringUtilTest { | |
@Test | |
public void removeAccentsShouldRemoveAccents() { | |
assertThat(removeAccents("ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝ")).isEqualTo("AAAAAACEEEEIIIINOOOOOUUUUY"); | |
assertThat(removeAccents("àáâãäåçèéêëìíîïñòóôõöùúûüýÿ")).isEqualTo("aaaaaaceeeeiiiinooooouuuuyy"); | |
} | |
@Test | |
public void removeAccentsKeepsNonAsciiCharacters() { | |
// Naïve versions of the accent stripping routine remove everything except ASCII characters. | |
// This strips out characters that look like accented characters, but are actually distinct, | |
// albeit non-ASCII, letter characters. E.g. Ð != D, Ø != O | |
assertThat(removeAccents("ÐØø")).isEqualTo("ÐØø"); | |
} | |
@Test | |
public void removeAccentsDoesNotSplitLigatures() { | |
// If it did, then ligatures like "OE" would be split into their constitutive letters, which could be useful. | |
// But I haven't found a way to do that. | |
assertThat(removeAccents("\u0152\u0153\u00DF")).isEqualTo("\u0152\u0153\u00DF"); // ligatures OE, oe, ß (ss) | |
} | |
@Test | |
public void capitaliseAndRemoveAccentsOnFirstLetterShouldDoWhatItSaysOnSingleWord() { | |
assertThat(capitaliseAndRemoveAccentsOnFirstLetter("ëomer")).isEqualTo("Eomer"); | |
} | |
@Test | |
public void capitaliseAndRemoveAccentsOnFirstLetterShouldDoWhatItSaysOnMultipleWords() { | |
assertThat(capitaliseAndRemoveAccentsOnFirstLetter("dol amroth")).isEqualTo("Dol Amroth"); | |
} | |
@Test | |
public void capitaliseAndRemoveAccentsOnFirstLetterShouldSwallowDoubleWhitespaceCharacters() { | |
assertThat(capitaliseAndRemoveAccentsOnFirstLetter("minas tirith")).isEqualTo("Minas Tirith"); | |
} | |
@Test | |
public void capitaliseAndRemoveAccentsOnFirstLetterShouldNotCapitaliseAfterHyphen() { | |
assertThat(capitaliseAndRemoveAccentsOnFirstLetter("Kheled-zâram")).isEqualTo("Kheled-zâram"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment