Skip to content

Instantly share code, notes, and snippets.

@andrewspencer
Created May 2, 2011 14:40
Show Gist options
  • Save andrewspencer/951699 to your computer and use it in GitHub Desktop.
Save andrewspencer/951699 to your computer and use it in GitHub Desktop.
Remove accents from a String
package net.andrewspencer.util;
import sun.text.Normalizer; // java.text.Normalizer in 1.6
/**
 * String helpers; this excerpt shows how to remove accents from a string.
 *
 * <p>Accent removal relies on the standard {@code java.text.Normalizer}
 * (available since Java 1.6) instead of the unofficial, JVM-dependent
 * {@code sun.text.Normalizer}. This class therefore no longer needs the
 * file-level {@code sun.text.Normalizer} import.
 *
 * <p>NB Ligatures such as OE (U+0152, U+0153) and sharp s (U+00DF) are not
 * split into their constituent letters.
 */
public final class StringUtil {

    /**
     * Matches runs of characters from the Unicode block
     * CombiningDiacriticalMarks. Compiled once so that each call does not
     * recompile the expression.
     */
    private static final java.util.regex.Pattern COMBINING_DIACRITICAL_MARKS =
            java.util.regex.Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    // ... (other members elided in the original snippet)

    /**
     * Removes accents (diacritical marks) from the given string.
     *
     * <p>Normalizing to NFKD converts each accented character into one
     * non-accented character followed by one or more characters representing
     * the accent(s) alone. Those accent-only characters belong to the Unicode
     * block CombiningDiacriticalMarks and are then stripped out.
     *
     * <p>Letters that merely look accented but are distinct characters, such
     * as U+00D0 (Eth) and U+00D8 (O with stroke), are kept as they are.
     *
     * @param notNullSource the string to strip accents from; must not be null
     * @return the input without its accents
     * @throws NullPointerException if {@code notNullSource} is null
     */
    public static String removeAccents(String notNullSource) {
        String decomposed = java.text.Normalizer.normalize(
                notNullSource,
                java.text.Normalizer.Form.NFKD);
        return COMBINING_DIACRITICAL_MARKS.matcher(decomposed).replaceAll("");
    }
}
package net.andrewspencer.util;
import static org.fest.assertions.Assertions.assertThat;
import static net.andrewspencer.util.StringUtil.*;
import org.testng.annotations.Test;
/**
 * Tests for the accent-removal and capitalisation helpers of
 * {@code StringUtil}.
 */
public class StringUtilTest {

    /** Accented Latin letters, upper and lower case, lose their accents. */
    @Test
    public void removeAccentsShouldRemoveAccents() {
        assertThat(removeAccents("ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝ")).isEqualTo("AAAAAACEEEEIIIINOOOOOUUUUY");
        assertThat(removeAccents("àáâãäåçèéêëìíîïñòóôõöùúûüýÿ")).isEqualTo("aaaaaaceeeeiiiinooooouuuuyy");
    }

    /** Distinct non-ASCII letters must survive accent removal untouched. */
    @Test
    public void removeAccentsKeepsNonAsciiCharacters() {
        // Naïve versions of the accent stripping routine remove everything except ASCII characters.
        // This strips out characters that look like accented characters, but are actually distinct,
        // albeit non-ASCII, letter characters. E.g. Ð != D, Ø != O
        assertThat(removeAccents("ÐØø")).isEqualTo("ÐØø");
    }

    /** Ligatures are expected to come back unchanged. */
    @Test
    public void removeAccentsDoesNotSplitLigatures() {
        // If it did, then ligatures like "OE" would be split into their constitutive letters, which could be useful.
        // But I haven't found a way to do that.
        assertThat(removeAccents("\u0152\u0153\u00DF")).isEqualTo("\u0152\u0153\u00DF"); // ligatures OE, oe, ß (ss)
    }

    /** A single word gets an unaccented capital first letter. */
    @Test
    public void capitaliseAndRemoveAccentsOnFirstLetterShouldDoWhatItSaysOnSingleWord() {
        assertThat(capitaliseAndRemoveAccentsOnFirstLetter("ëomer")).isEqualTo("Eomer");
    }

    /** Every whitespace-separated word is capitalised. */
    @Test
    public void capitaliseAndRemoveAccentsOnFirstLetterShouldDoWhatItSaysOnMultipleWords() {
        assertThat(capitaliseAndRemoveAccentsOnFirstLetter("dol amroth")).isEqualTo("Dol Amroth");
    }

    /** Two consecutive whitespace characters collapse into one in the result. */
    @Test
    public void capitaliseAndRemoveAccentsOnFirstLetterShouldSwallowDoubleWhitespaceCharacters() {
        // The input deliberately contains TWO spaces between the words; with a
        // single space this test would not exercise the swallowing behaviour.
        assertThat(capitaliseAndRemoveAccentsOnFirstLetter("minas  tirith")).isEqualTo("Minas Tirith");
    }

    /** A hyphen does not start a new word, and accents after the first letter are kept. */
    @Test
    public void capitaliseAndRemoveAccentsOnFirstLetterShouldNotCapitaliseAfterHyphen() {
        assertThat(capitaliseAndRemoveAccentsOnFirstLetter("Kheled-zâram")).isEqualTo("Kheled-zâram");
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment