Created
March 19, 2012 13:16
-
-
Save jabbrwcky/2111727 to your computer and use it in GitHub Desktop.
StringCleaner: a demonstration of how to use ICU4J to convert practically any UTF-8/Java String to an ASCII-compatible form. Requires ICU4J, TestNG and FindBugs annotations.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package net.hausherr.util; | |
import com.ibm.icu.text.Normalizer; | |
import java.util.HashMap; | |
import java.util.Map; | |
/** | |
* StringCleaner provides a method for normalizing a string to generally | |
* ASCII-compatible form. | |
*/ | |
public class StringCleaner { | |
/** | |
* Map containing replacements for corner cases (i.e. not decomposed by the | |
* Normalizer) | |
*/ | |
private Map<Integer, Replacement> charMap = buildReplacementMap(); | |
/** | |
* builds a map containing all replacements that are not automatically | |
* performed by the normalizer. | |
* | |
* @return Replacement containing both replacements for upper- and lowercase | |
* mode. | |
*/ | |
private Map<Integer, Replacement> buildReplacementMap() { | |
Map<Integer, Replacement> map = new HashMap<Integer, Replacement>(); | |
map.put(0xc6, new Replacement("AE", "Ae")); | |
map.put(0xe6, new Replacement("ae")); | |
map.put(0xd0, new Replacement("D")); | |
map.put(0x111, new Replacement("d")); | |
map.put(0xd8, new Replacement("O")); | |
map.put(0xf8, new Replacement("o")); | |
map.put(0x152, new Replacement("OE", "Oe")); | |
map.put(0x153, new Replacement("oe")); | |
map.put(0x166, new Replacement("T")); | |
map.put(0x167, new Replacement("t")); | |
return map; | |
} | |
/** | |
* <p> | |
* This method takes an input String and replaces all special characters | |
* like umlauts, accented or other letter with diacritical marks with their | |
* basic ascii eqivalents. | |
* </p> | |
* <p> | |
* Example: The String "André" or "Ándre" would be converted to "Andre". | |
* </p> | |
* <p> | |
* The flag <code>replaceAllCapitalLetters</code> controls the replacement | |
* behavior of special characters that are decomposed into two plain ASCII | |
* chars, like "Æ" or "æ". | |
* </p> | |
* <p> | |
* In "lowercase" mode (i.e.<code> replaceAllCapitalLetters=false</code> ) | |
* both aforementioned examples would be converted to "Ae". | |
* </p> | |
* <p> | |
* In "uppercase" mode (<code>replaceAllCapitalLetters=false</code>) the | |
* replacement would be "AE". | |
* </p> | |
* | |
* @param input String to convert | |
* @param replaceAllCapitalLetters <code>true</code> causes uppercase special chars that are | |
* replaced by more than one character to be replaced by | |
* all-uppercase replacements; <code>false</code> will cause only | |
* the initial character of the replacements to be in uppercase | |
* and all subsequent replacement characters will be in | |
* lowercase. | |
* @return Input string reduced to ASCII-safe characters. | |
*/ | |
public String convertToAscii(String input, boolean replaceAllCapitalLetters) { | |
/* | |
* operating on char arrays because java.lang.String seems to perform an | |
* automatic recomposition of decomposed characters. | |
*/ | |
String result = null; | |
if (null != input) { | |
char[] src = input.toCharArray(); | |
/* save space for exotic UTF characters */ | |
char[] target = new char[src.length * 3]; | |
int len = Normalizer.normalize(input.toCharArray(), target, Normalizer.NFKD, 0); | |
result = processSpecialChars(target, 0, len, replaceAllCapitalLetters); | |
} | |
return result; | |
} | |
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = {"SF_SWITCH_FALLTHROUGH"}, justification = "Intentional fallthrough (JHAUSHER)") | |
private String processSpecialChars(char[] target, int offset, int len, boolean uppercase) { | |
StringBuilder result = new StringBuilder(); | |
boolean skip = false; | |
for (int i = 0; i < len; i++) { | |
if (skip) { | |
skip = false; | |
} else { | |
char c = target[i]; | |
if ((c > 0x20 && c < 0x40) || (c > 0x7a && c < 0xc0) || (c > 0x5a && c < 0x61) || (c > 0x79 && c < 0xc0) || c == 0xd7 || c == 0xf7) { | |
result.append(c); | |
} else if (Character.isDigit(c) || Character.isISOControl(c)) { | |
result.append(c); | |
} else if (Character.isWhitespace(c) || Character.isLetter(c)) { | |
boolean isUpper = false; | |
switch (c) { | |
case '\u00df': | |
result.append("ss"); | |
break; | |
/* Handling of capital and lowercase umlauts */ | |
case 'A': | |
case 'O': | |
case 'U': | |
isUpper = true; | |
case 'a': | |
case 'o': | |
case 'u': | |
result.append(c); | |
if (i + 1 < target.length && target[i + 1] == 0x308) { | |
result.append(isUpper && uppercase ? 'E' : 'e'); | |
skip = true; | |
} | |
break; | |
default: | |
Replacement rep = charMap.get(Integer.valueOf(c)); | |
if (rep != null) { | |
result.append(uppercase ? rep.upper : rep.lower); | |
} else | |
result.append(c); | |
} | |
} | |
} | |
} | |
return result.toString(); | |
} | |
/** | |
* Combination of replacements for upper- and lowercase mode. | |
*/ | |
private static class Replacement { | |
private final String upper; | |
private final String lower; | |
Replacement(String ucReplacement, String lcReplacement) { | |
this.upper = ucReplacement; | |
this.lower = lcReplacement; | |
} | |
Replacement(String caseInsensitiveReplacement) { | |
this(caseInsensitiveReplacement, caseInsensitiveReplacement); | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package net.hausherr.util; | |
import org.testng.annotations.AfterClass; | |
import org.testng.annotations.BeforeClass; | |
import org.testng.annotations.DataProvider; | |
import org.testng.annotations.Test; | |
import java.lang.reflect.Method; | |
import java.util.Iterator; | |
import static org.testng.Assert.assertEquals; | |
/** | |
* Unit-Test for testing charachter replacement. | |
*/ | |
public class StringCleanerTest { | |
private StringCleaner sut; | |
@BeforeClass | |
public void setupClass() { | |
sut = new StringCleaner(); | |
} | |
@AfterClass | |
public void tearDownClass() { | |
} | |
/** | |
* Tests replacement of special characters in "lowercase" mode. | |
* | |
* @param testData | |
* Char/String to test | |
* @param expected | |
* expected normalized form | |
*/ | |
@Test(dataProvider = "asciiConversion") | |
void testReplaceSpecialCharacters(String testData, String expected) { | |
String result = sut.convertToAscii(testData, false); | |
assertEquals(result, expected); | |
} | |
/** | |
* | |
* Tests replacement of special characters in "uppercase" mode. | |
* | |
* @param testData | |
* Char/String to test | |
* @param expected | |
* expected normalized form | |
*/ | |
@Test(dataProvider = "asciiConversion") | |
void testReplaceSpecialCharactersUppercase(String testData, String expected) { | |
String result = sut.convertToAscii(testData, true); | |
assertEquals(result, expected); | |
} | |
/** | |
* Data provider for the test methods either for upper- or lowercase mode.. | |
* | |
* Provides Data both for upper- and lowercase tests as Iterator over the | |
* array of String arrays that holds the "raw" data. | |
* | |
* @param m | |
* actual testmethod, provided by TestNG | |
* @return Iterator over test data. | |
*/ | |
@DataProvider(name = "asciiConversion") | |
public Iterator<Object[]> dataProvider(Method m) { | |
if (m.getName().endsWith("Uppercase")) { | |
return new TestDataIterator(true); | |
} | |
return new TestDataIterator(false); | |
} | |
/** | |
* This array of String Arrays holds the data for all tests. | |
* | |
* <p> | |
* The data contained in each array has the following semantics: | |
* </p> | |
* <code>{ "EXP(lc)", "EXP(uc)", "TV_1", ... , "TV-n" }</code> | |
* <p> | |
* Legend: | |
* </p> | |
* <ul> | |
* <li>EXP(lc): Reference value for lowercase tests (expectation)</li> | |
* <li>EXP(uc): Reference value for uppercase tests (expectation)</li> | |
* <li>TV_1...TV_n: Test String/Character 1...n</li> | |
* </ul> | |
*/ | |
private static final String[][] testDataSource = { | |
// | |
/* Sanity checks first */ | |
{ "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "ABCDEFGHIJKLMNOPQRSTUVWXYZ" },// | |
{ "abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz" },// | |
{ "1234567890", "1234567890", "1234567890" },// | |
/* Symbols */ | |
{ "!\"§$%&/()=?'\\", "!\"§$%&/()=?'\\", "!\"§$%&/()=?'\\" },// | |
{ "÷ס¢£¤·", "÷ס¢£¤·", "÷ס¢£¤·" }, // | |
{ " \t\r\n", " \t\r\n", " \t\r\n" },// | |
/* Fun starts here */ | |
{ "A", "A", "Ā", "Ă", "Å", "À", "Â" }, // | |
{ "a", "a", "ā", "ă", "å", "à", "â" }, // | |
{ "Ae", "AE", "Æ", "Ǽ", "Ä" },// | |
{ "ae", "ae", "æ", "ǣ", "ä", "ǟ" },// | |
{ "C", "C", "Ċ", "Ç", "Č" },// | |
{ "c", "c", "ċ", "ç", "č" },// | |
{ "D", "D", "Ď", "Ð" },// | |
{ "d", "d", "ď", "đ" },// | |
{ "E", "E", "Ê", "Ë", "È", "É", "Ê" }, // | |
{ "e", "e", "ê", "ë", "è", "é" }, // | |
{ "G", "G", "Ĝ", "Ğ", "Ġ", "Ģ" },// | |
{ "g", "g", "ĝ", "ğ", "ġ", "ģ" },// | |
{ "I", "I", "Ì", "Í", "Î", "Ï", "Ĩ" }, // | |
{ "i", "i", "ĩ", "ì", "í", "î", "ï" },// | |
{ "N", "N", "Ñ" }, // | |
{ "n", "n", "ñ", },// | |
{ "O", "O", "Ø", "Ò", "Ó", "Ô", "Õ", "Ő", "Ǿ" },// | |
{ "o", "o", "ø", "ő", "ò", "ó", "ô", "õ", "ǿ" },// | |
{ "Oe", "OE", "Ö", "Œ" },// | |
{ "oe", "oe", "ö", "œ" },// | |
{ "ss", "ss", "ß" },// | |
{ "Aeffin", "AEffin", "Ä\uFB03n" },// | |
{ "IJ", "IJ", "IJ" },// | |
{ "ij", "ij", "ij" },// | |
{ "U", "U", "Û", "Ù", "Ú", "Ů" },// | |
{ "u", "u", "û", "ù", "ú", "ů" },// | |
{ "Ue", "UE", "Ü" },// | |
{ "ue", "ue", "ü" },// | |
{ "T", "T", "Ţ", "Ŧ" },// | |
{ "t", "t", "ţ", "ŧ" },// | |
{ "Y", "Y", "Ý" }, // | |
{ "y", "y", "ý", "ÿ" } // | |
}; | |
/** | |
* Implementation of an iterator that knows how to iterate over the source | |
* test data array for upper- and lowercase mode. | |
* | |
* @author JHAUSHER | |
*/ | |
private static final class TestDataIterator implements Iterator<Object[]> { | |
int dataIndex = 0; | |
int currentIndex = 2; | |
final boolean uppercase; | |
public TestDataIterator(boolean uppercase) { | |
this.uppercase = uppercase; | |
} | |
public boolean hasNext() { | |
return !(dataIndex == testDataSource.length && currentIndex != testDataSource[testDataSource.length - 1].length - 1); | |
} | |
public Object[] next() { | |
Object[] result = new Object[2]; | |
int idx = currentIndex++; | |
result[0] = testDataSource[dataIndex][idx]; | |
result[1] = (uppercase ? testDataSource[dataIndex][1] : testDataSource[dataIndex][0]); | |
if (currentIndex == testDataSource[dataIndex].length) { | |
currentIndex = 2; | |
dataIndex += 1; | |
} | |
return result; | |
} | |
public void remove() { | |
// ignore | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment