jabbrwcky · March 19, 2012 13:16
diff --git a/StringCleaner.java b/StringCleaner.java
 package net.hausherr.util;

 import com.ibm.icu.text.Normalizer;

 import java.util.HashMap;
 import java.util.Map;

 /**
 * StringCleaner provides a method for normalizing a string to generally
 * ASCII-compatible form.
 */
 public class StringCleaner {

 	/** Combining diaeresis (U+0308), the mark NFKD splits off from an umlaut. */
 	private static final char COMBINING_DIAERESIS = 0x308;

 	/** Sharp s (U+00DF); NFKD leaves it intact, so it is expanded by hand. */
 	private static final char SHARP_S = 0xdf;

 	/** Base letters that are expanded to "ae"/"oe"/"ue" when a diaeresis follows. */
 	private static final String UMLAUT_BASES = "AOUaou";

 	/**
 	 * Map containing replacements for corner cases (i.e. letters that are not
 	 * decomposed by the normalizer), keyed by the character's code.
 	 */
 	private static final Map<Integer, Replacement> REPLACEMENTS = buildReplacementMap();

 	/**
 	 * Builds a map containing all replacements that are not automatically
 	 * performed by the normalizer.
 	 *
 	 * @return map from character code to a Replacement holding the
 	 *         replacements for upper- and lowercase mode.
 	 */
 	private static Map<Integer, Replacement> buildReplacementMap() {
 		Map<Integer, Replacement> map = new HashMap<Integer, Replacement>();
 		map.put(0xc6, new Replacement("AE", "Ae"));
 		map.put(0xe6, new Replacement("ae"));
 		map.put(0xd0, new Replacement("D"));
 		map.put(0x111, new Replacement("d"));
 		map.put(0xd8, new Replacement("O"));
 		map.put(0xf8, new Replacement("o"));
 		map.put(0x152, new Replacement("OE", "Oe"));
 		map.put(0x153, new Replacement("oe"));
 		map.put(0x166, new Replacement("T"));
 		map.put(0x167, new Replacement("t"));
 		return map;
 	}

 	/**
 	 * <p>
 	 * Replaces all special characters of the input (umlauts, accented letters
 	 * and other letters with diacritical marks) with their basic ASCII
 	 * equivalents.
 	 * </p>
 	 * <p>
 	 * Example: The String "André" or "Ándre" would be converted to "Andre".
 	 * </p>
 	 * <p>
 	 * The flag <code>replaceAllCapitalLetters</code> controls the replacement
 	 * of uppercase special characters that expand to two plain ASCII chars,
 	 * like "Æ" or "Ä".
 	 * </p>
 	 * <p>
 	 * In "lowercase" mode (i.e. <code>replaceAllCapitalLetters=false</code>)
 	 * "Æ" is converted to "Ae".
 	 * </p>
 	 * <p>
 	 * In "uppercase" mode (<code>replaceAllCapitalLetters=true</code>) the
 	 * replacement is "AE". Lowercase characters such as "æ" become "ae" in
 	 * both modes.
 	 * </p>
 	 *
 	 * @param input                    String to convert, may be <code>null</code>
 	 * @param replaceAllCapitalLetters <code>true</code> causes uppercase special chars that are
 	 *                                 replaced by more than one character to be replaced by
 	 *                                 all-uppercase replacements; <code>false</code> will cause only
 	 *                                 the initial character of the replacements to be in uppercase
 	 *                                 and all subsequent replacement characters will be in
 	 *                                 lowercase.
 	 * @return the converted string, or <code>null</code> if the input was
 	 *         <code>null</code>. Letters that have no mapping (e.g. non-Latin
 	 *         scripts) are passed through unchanged.
 	 */
 	public String convertToAscii(String input, boolean replaceAllCapitalLetters) {
 		if (input == null) {
 			return null;
 		}
 		/*
 		 * The String-returning normalizer sizes its own result. A caller-sized
 		 * buffer of three chars per input char is too small for characters
 		 * whose compatibility decomposition is longer than that.
 		 */
 		char[] decomposed = java.text.Normalizer.normalize(input, java.text.Normalizer.Form.NFKD).toCharArray();
 		return processSpecialChars(decomposed, 0, decomposed.length, replaceAllCapitalLetters);
 	}

 	/**
 	 * Walks over a decomposed character sequence and builds the cleaned
 	 * result: symbols, digits and control characters are copied, letters are
 	 * copied or replaced, and everything else (notably the combining marks
 	 * produced by the decomposition) is dropped.
 	 *
 	 * @param target    decomposed characters
 	 * @param offset    index of the first character to process
 	 * @param len       number of characters to process
 	 * @param uppercase <code>true</code> for all-uppercase multi-char replacements
 	 * @return the cleaned string
 	 */
 	private String processSpecialChars(char[] target, int offset, int len, boolean uppercase) {
 		StringBuilder result = new StringBuilder(len);
 		int end = offset + len;

 		for (int i = offset; i < end; i++) {
 			char c = target[i];
 			if (isPassThrough(c)) {
 				result.append(c);
 			} else if (!Character.isWhitespace(c) && !Character.isLetter(c)) {
 				/* neither symbol nor letter, e.g. a combining mark: drop it */
 				continue;
 			} else if (c == SHARP_S) {
 				result.append("ss");
 			} else if (UMLAUT_BASES.indexOf(c) >= 0) {
 				result.append(c);
 				/* look ahead only inside the processed range */
 				if (i + 1 < end && target[i + 1] == COMBINING_DIAERESIS) {
 					result.append(uppercase && Character.isUpperCase(c) ? 'E' : 'e');
 					i++; /* the diaeresis has been consumed */
 				}
 			} else {
 				Replacement rep = REPLACEMENTS.get(Integer.valueOf(c));
 				if (rep != null) {
 					result.append(uppercase ? rep.upper : rep.lower);
 				} else {
 					result.append(c);
 				}
 			}
 		}

 		return result.toString();
 	}

 	/**
 	 * Tells whether a character is copied to the result as it is: ASCII and
 	 * Latin-1 symbols, digits and control characters.
 	 *
 	 * @param c character to classify
 	 * @return <code>true</code> if the character is copied unchanged
 	 */
 	private static boolean isPassThrough(char c) {
 		return (c > 0x20 && c <= 0x40) /* '!' up to and including '@' */
 				|| (c > 0x5a && c < 0x61) /* '[' up to '`' */
 				|| (c > 0x7a && c < 0xc0) /* '{' up to the last Latin-1 symbol */
 				|| c == 0xd7 || c == 0xf7 /* multiplication and division sign */
 				|| Character.isDigit(c) || Character.isISOControl(c);
 	}


 	/**
 	 * Combination of replacements for upper- and lowercase mode.
 	 */
 	private static class Replacement {

 		private final String upper;
 		private final String lower;

 		Replacement(String ucReplacement, String lcReplacement) {
 			this.upper = ucReplacement;
 			this.lower = lcReplacement;
 		}

 		Replacement(String caseInsensitiveReplacement) {
 			this(caseInsensitiveReplacement, caseInsensitiveReplacement);
 		}

 	}
 }
diff --git a/StringCleanerTest.java b/StringCleanerTest.java
 package net.hausherr.util;

 import org.testng.annotations.AfterClass;
 import org.testng.annotations.BeforeClass;
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;

 import java.lang.reflect.Method;
 import java.util.Iterator;

 import static org.testng.Assert.assertEquals;

 /**
 * Unit-Test for testing character replacement.
 */
 public class StringCleanerTest {

 	/** Instance under test, created once for the whole class. */
 	private StringCleaner sut;

 	@BeforeClass
 	public void setupClass() {
 		sut = new StringCleaner();
 	}

 	@AfterClass
 	public void tearDownClass() {
 		/* nothing to release */
 	}

 	/**
 	 * Tests replacement of special characters in "lowercase" mode.
 	 *
 	 * @param testData
 	 *            Char/String to test
 	 * @param expected
 	 *            expected normalized form
 	 */
 	@Test(dataProvider = "asciiConversion")
 	public void testReplaceSpecialCharacters(String testData, String expected) {
 		String result = sut.convertToAscii(testData, false);
 		assertEquals(result, expected);
 	}

 	/**
 	 * Tests replacement of special characters in "uppercase" mode.
 	 *
 	 * @param testData
 	 *            Char/String to test
 	 * @param expected
 	 *            expected normalized form
 	 */
 	@Test(dataProvider = "asciiConversion")
 	public void testReplaceSpecialCharactersUppercase(String testData, String expected) {
 		String result = sut.convertToAscii(testData, true);
 		assertEquals(result, expected);
 	}

 	/**
 	 * Data provider for the test methods, for either upper- or lowercase mode.
 	 *
 	 * The mode is derived from the name of the requesting test method: a name
 	 * ending in "Uppercase" selects the uppercase expectations.
 	 *
 	 * @param m
 	 *            actual test method, provided by TestNG
 	 * @return Iterator over test data; each element is
 	 *         <code>{ testValue, expectedValue }</code>.
 	 */
 	@DataProvider(name = "asciiConversion")
 	public Iterator<Object[]> dataProvider(Method m) {
 		return new TestDataIterator(m.getName().endsWith("Uppercase"));
 	}

 	/**
 	 * This array of String Arrays holds the data for all tests.
 	 *
 	 * <p>
 	 * The data contained in each array has the following semantics:
 	 * </p>
 	 * <code>{ "EXP(lc)", "EXP(uc)", "TV_1", ... , "TV_n" }</code>
 	 * <p>
 	 * Legend:
 	 * </p>
 	 * <ul>
 	 * <li>EXP(lc): Reference value for lowercase tests (expectation)</li>
 	 * <li>EXP(uc): Reference value for uppercase tests (expectation)</li>
 	 * <li>TV_1...TV_n: Test String/Character 1...n</li>
 	 * </ul>
 	 */
 	private static final String[][] testDataSource = {
 			//
 			/* Sanity checks first */
 			{ "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "ABCDEFGHIJKLMNOPQRSTUVWXYZ" },//
 			{ "abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz" },//
 			{ "1234567890", "1234567890", "1234567890" },//
 			/* Symbols */
 			{ "!\"§$%&/()=?'\\", "!\"§$%&/()=?'\\", "!\"§$%&/()=?'\\" },//
 			{ "÷×¡¢£¤·", "÷×¡¢£¤·", "÷×¡¢£¤·" }, //
 			{ " \t\r\n", " \t\r\n", " \t\r\n" },//
 			/* Fun starts here */
 			{ "A", "A", "Ā", "Ă", "Å", "À", "Â" }, //
 			{ "a", "a", "ā", "ă", "å", "à", "â" }, //
 			{ "Ae", "AE", "Æ", "Ǽ", "Ä" },//
 			{ "ae", "ae", "æ", "ǣ", "ä", "ǟ" },//
 			{ "C", "C", "Ċ", "Ç", "Č" },//
 			{ "c", "c", "ċ", "ç", "č" },//
 			{ "D", "D", "Ď", "Ð" },//
 			{ "d", "d", "ď", "đ" },//
 			{ "E", "E", "Ê", "Ë", "È", "É", "Ê" }, //
 			{ "e", "e", "ê", "ë", "è", "é" }, //
 			{ "G", "G", "Ĝ", "Ğ", "Ġ", "Ģ" },//
 			{ "g", "g", "ĝ", "ğ", "ġ", "ģ" },//
 			{ "I", "I", "Ì", "Í", "Î", "Ï", "Ĩ" }, //
 			{ "i", "i", "ĩ", "ì", "í", "î", "ï" },//
 			{ "N", "N", "Ñ" }, //
 			{ "n", "n", "ñ", },//
 			{ "O", "O", "Ø", "Ò", "Ó", "Ô", "Õ", "Ő", "Ǿ" },//
 			{ "o", "o", "ø", "ő", "ò", "ó", "ô", "õ", "ǿ" },//
 			{ "Oe", "OE", "Ö", "Œ" },//
 			{ "oe", "oe", "ö", "œ" },//
 			{ "ss", "ss", "ß" },//
 			{ "Aeffin", "AEffin", "Ä\uFB03n" },//
 			{ "IJ", "IJ", "Ĳ" },//
 			{ "ij", "ij", "ĳ" },//
 			{ "U", "U", "Û", "Ù", "Ú", "Ů" },//
 			{ "u", "u", "û", "ù", "ú", "ů" },//
 			{ "Ue", "UE", "Ü" },//
 			{ "ue", "ue", "ü" },//
 			{ "T", "T", "Ţ", "Ŧ" },//
 			{ "t", "t", "ţ", "ŧ" },//
 			{ "Y", "Y", "Ý" }, //
 			{ "y", "y", "ý", "ÿ" } //
 	};

 	/**
 	 * Implementation of an iterator that knows how to iterate over the source
 	 * test data array for upper- and lowercase mode. It yields one
 	 * <code>{ testValue, expectedValue }</code> pair per test value of every
 	 * row.
 	 *
 	 * @author JHAUSHER
 	 */
 	private static final class TestDataIterator implements Iterator<Object[]> {

 		/** Index of the first test value in a row; the two slots before it hold the expectations. */
 		private static final int FIRST_TEST_VALUE = 2;

 		/** Row the next element is taken from; equals the row count once exhausted. */
 		private int dataIndex = 0;

 		/** Position of the next test value inside the current row. */
 		private int currentIndex = FIRST_TEST_VALUE;

 		private final boolean uppercase;

 		public TestDataIterator(boolean uppercase) {
 			this.uppercase = uppercase;
 			skipRowsWithoutTestValues();
 		}

 		/**
 		 * @return <code>true</code> while there is an unvisited row left. The
 		 *         answer does not depend on the length of any particular row.
 		 */
 		public boolean hasNext() {
 			return dataIndex < testDataSource.length;
 		}

 		/**
 		 * @return the next <code>{ testValue, expectedValue }</code> pair
 		 * @throws java.util.NoSuchElementException
 		 *             if all test data has been consumed
 		 */
 		public Object[] next() {
 			if (!hasNext()) {
 				throw new java.util.NoSuchElementException("test data exhausted");
 			}
 			String[] row = testDataSource[dataIndex];
 			Object[] result = { row[currentIndex], uppercase ? row[1] : row[0] };

 			currentIndex++;
 			if (currentIndex == row.length) {
 				/* row finished: continue with the first test value of the next one */
 				currentIndex = FIRST_TEST_VALUE;
 				dataIndex++;
 				skipRowsWithoutTestValues();
 			}

 			return result;
 		}

 		/**
 		 * The test data is read-only.
 		 *
 		 * @throws UnsupportedOperationException
 		 *             always
 		 */
 		public void remove() {
 			throw new UnsupportedOperationException("test data is read-only");
 		}

 		/** Moves on to the next row that contains at least one test value. */
 		private void skipRowsWithoutTestValues() {
 			while (dataIndex < testDataSource.length && testDataSource[dataIndex].length <= FIRST_TEST_VALUE) {
 				dataIndex++;
 			}
 		}
 	}

 }
	package net.hausherr.util;

	import com.ibm.icu.text.Normalizer;

	import java.util.HashMap;
	import java.util.Map;

	/**
	* StringCleaner provides a method for normalizing a string to generally
	* ASCII-compatible form.
	*/
	public class StringCleaner {

		/** Combining diaeresis (U+0308), the mark NFKD splits off from an umlaut. */
		private static final char COMBINING_DIAERESIS = 0x308;

		/** Sharp s (U+00DF); NFKD leaves it intact, so it is expanded by hand. */
		private static final char SHARP_S = 0xdf;

		/** Base letters that are expanded to "ae"/"oe"/"ue" when a diaeresis follows. */
		private static final String UMLAUT_BASES = "AOUaou";

		/**
		 * Map containing replacements for corner cases (i.e. letters that are not
		 * decomposed by the normalizer), keyed by the character's code.
		 */
		private static final Map<Integer, Replacement> REPLACEMENTS = buildReplacementMap();

		/**
		 * Builds a map containing all replacements that are not automatically
		 * performed by the normalizer.
		 *
		 * @return map from character code to a Replacement holding the
		 *         replacements for upper- and lowercase mode.
		 */
		private static Map<Integer, Replacement> buildReplacementMap() {
			Map<Integer, Replacement> map = new HashMap<Integer, Replacement>();
			map.put(0xc6, new Replacement("AE", "Ae"));
			map.put(0xe6, new Replacement("ae"));
			map.put(0xd0, new Replacement("D"));
			map.put(0x111, new Replacement("d"));
			map.put(0xd8, new Replacement("O"));
			map.put(0xf8, new Replacement("o"));
			map.put(0x152, new Replacement("OE", "Oe"));
			map.put(0x153, new Replacement("oe"));
			map.put(0x166, new Replacement("T"));
			map.put(0x167, new Replacement("t"));
			return map;
		}

		/**
		 * <p>
		 * Replaces all special characters of the input (umlauts, accented letters
		 * and other letters with diacritical marks) with their basic ASCII
		 * equivalents.
		 * </p>
		 * <p>
		 * Example: The String "André" or "Ándre" would be converted to "Andre".
		 * </p>
		 * <p>
		 * The flag <code>replaceAllCapitalLetters</code> controls the replacement
		 * of uppercase special characters that expand to two plain ASCII chars,
		 * like "Æ" or "Ä".
		 * </p>
		 * <p>
		 * In "lowercase" mode (i.e. <code>replaceAllCapitalLetters=false</code>)
		 * "Æ" is converted to "Ae".
		 * </p>
		 * <p>
		 * In "uppercase" mode (<code>replaceAllCapitalLetters=true</code>) the
		 * replacement is "AE". Lowercase characters such as "æ" become "ae" in
		 * both modes.
		 * </p>
		 *
		 * @param input String to convert, may be <code>null</code>
		 * @param replaceAllCapitalLetters <code>true</code> causes uppercase special chars that are
		 * replaced by more than one character to be replaced by
		 * all-uppercase replacements; <code>false</code> will cause only
		 * the initial character of the replacements to be in uppercase
		 * and all subsequent replacement characters will be in
		 * lowercase.
		 * @return the converted string, or <code>null</code> if the input was
		 * <code>null</code>. Letters that have no mapping (e.g. non-Latin
		 * scripts) are passed through unchanged.
		 */
		public String convertToAscii(String input, boolean replaceAllCapitalLetters) {
			if (input == null) {
				return null;
			}
			/*
			 * The String-returning normalizer sizes its own result. A caller-sized
			 * buffer of three chars per input char is too small for characters
			 * whose compatibility decomposition is longer than that.
			 */
			char[] decomposed = java.text.Normalizer.normalize(input, java.text.Normalizer.Form.NFKD).toCharArray();
			return processSpecialChars(decomposed, 0, decomposed.length, replaceAllCapitalLetters);
		}

		/**
		 * Walks over a decomposed character sequence and builds the cleaned
		 * result: symbols, digits and control characters are copied, letters are
		 * copied or replaced, and everything else (notably the combining marks
		 * produced by the decomposition) is dropped.
		 *
		 * @param target decomposed characters
		 * @param offset index of the first character to process
		 * @param len number of characters to process
		 * @param uppercase <code>true</code> for all-uppercase multi-char replacements
		 * @return the cleaned string
		 */
		private String processSpecialChars(char[] target, int offset, int len, boolean uppercase) {
			StringBuilder result = new StringBuilder(len);
			int end = offset + len;

			for (int i = offset; i < end; i++) {
				char c = target[i];
				if (isPassThrough(c)) {
					result.append(c);
				} else if (!Character.isWhitespace(c) && !Character.isLetter(c)) {
					/* neither symbol nor letter, e.g. a combining mark: drop it */
					continue;
				} else if (c == SHARP_S) {
					result.append("ss");
				} else if (UMLAUT_BASES.indexOf(c) >= 0) {
					result.append(c);
					/* look ahead only inside the processed range */
					if (i + 1 < end && target[i + 1] == COMBINING_DIAERESIS) {
						result.append(uppercase && Character.isUpperCase(c) ? 'E' : 'e');
						i++; /* the diaeresis has been consumed */
					}
				} else {
					Replacement rep = REPLACEMENTS.get(Integer.valueOf(c));
					if (rep != null) {
						result.append(uppercase ? rep.upper : rep.lower);
					} else {
						result.append(c);
					}
				}
			}

			return result.toString();
		}

		/**
		 * Tells whether a character is copied to the result as it is: ASCII and
		 * Latin-1 symbols, digits and control characters.
		 *
		 * @param c character to classify
		 * @return <code>true</code> if the character is copied unchanged
		 */
		private static boolean isPassThrough(char c) {
			return (c > 0x20 && c <= 0x40) /* '!' up to and including '@' */
					|| (c > 0x5a && c < 0x61) /* '[' up to '`' */
					|| (c > 0x7a && c < 0xc0) /* '{' up to the last Latin-1 symbol */
					|| c == 0xd7 || c == 0xf7 /* multiplication and division sign */
					|| Character.isDigit(c) || Character.isISOControl(c);
		}


		/**
		 * Combination of replacements for upper- and lowercase mode.
		 */
		private static class Replacement {

			private final String upper;
			private final String lower;

			Replacement(String ucReplacement, String lcReplacement) {
				this.upper = ucReplacement;
				this.lower = lcReplacement;
			}

			Replacement(String caseInsensitiveReplacement) {
				this(caseInsensitiveReplacement, caseInsensitiveReplacement);
			}

		}
	}
	package net.hausherr.util;

	import org.testng.annotations.AfterClass;
	import org.testng.annotations.BeforeClass;
	import org.testng.annotations.DataProvider;
	import org.testng.annotations.Test;

	import java.lang.reflect.Method;
	import java.util.Iterator;

	import static org.testng.Assert.assertEquals;

	/**
	* Unit-Test for testing character replacement.
	*/
	public class StringCleanerTest {

		/** Instance under test, created once for the whole class. */
		private StringCleaner sut;

		@BeforeClass
		public void setupClass() {
			sut = new StringCleaner();
		}

		@AfterClass
		public void tearDownClass() {
			/* nothing to release */
		}

		/**
		 * Tests replacement of special characters in "lowercase" mode.
		 *
		 * @param testData
		 * Char/String to test
		 * @param expected
		 * expected normalized form
		 */
		@Test(dataProvider = "asciiConversion")
		public void testReplaceSpecialCharacters(String testData, String expected) {
			String result = sut.convertToAscii(testData, false);
			assertEquals(result, expected);
		}

		/**
		 * Tests replacement of special characters in "uppercase" mode.
		 *
		 * @param testData
		 * Char/String to test
		 * @param expected
		 * expected normalized form
		 */
		@Test(dataProvider = "asciiConversion")
		public void testReplaceSpecialCharactersUppercase(String testData, String expected) {
			String result = sut.convertToAscii(testData, true);
			assertEquals(result, expected);
		}

		/**
		 * Data provider for the test methods, for either upper- or lowercase mode.
		 *
		 * The mode is derived from the name of the requesting test method: a name
		 * ending in "Uppercase" selects the uppercase expectations.
		 *
		 * @param m
		 * actual test method, provided by TestNG
		 * @return Iterator over test data; each element is
		 * <code>{ testValue, expectedValue }</code>.
		 */
		@DataProvider(name = "asciiConversion")
		public Iterator<Object[]> dataProvider(Method m) {
			return new TestDataIterator(m.getName().endsWith("Uppercase"));
		}

		/**
		 * This array of String Arrays holds the data for all tests.
		 *
		 * <p>
		 * The data contained in each array has the following semantics:
		 * </p>
		 * <code>{ "EXP(lc)", "EXP(uc)", "TV_1", ... , "TV_n" }</code>
		 * <p>
		 * Legend:
		 * </p>
		 * <ul>
		 * <li>EXP(lc): Reference value for lowercase tests (expectation)</li>
		 * <li>EXP(uc): Reference value for uppercase tests (expectation)</li>
		 * <li>TV_1...TV_n: Test String/Character 1...n</li>
		 * </ul>
		 */
		private static final String[][] testDataSource = {
				//
				/* Sanity checks first */
				{ "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "ABCDEFGHIJKLMNOPQRSTUVWXYZ" },//
				{ "abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz" },//
				{ "1234567890", "1234567890", "1234567890" },//
				/* Symbols */
				{ "!\"§$%&/()=?'\\", "!\"§$%&/()=?'\\", "!\"§$%&/()=?'\\" },//
				{ "÷×¡¢£¤·", "÷×¡¢£¤·", "÷×¡¢£¤·" }, //
				{ " \t\r\n", " \t\r\n", " \t\r\n" },//
				/* Fun starts here */
				{ "A", "A", "Ā", "Ă", "Å", "À", "Â" }, //
				{ "a", "a", "ā", "ă", "å", "à", "â" }, //
				{ "Ae", "AE", "Æ", "Ǽ", "Ä" },//
				{ "ae", "ae", "æ", "ǣ", "ä", "ǟ" },//
				{ "C", "C", "Ċ", "Ç", "Č" },//
				{ "c", "c", "ċ", "ç", "č" },//
				{ "D", "D", "Ď", "Ð" },//
				{ "d", "d", "ď", "đ" },//
				{ "E", "E", "Ê", "Ë", "È", "É", "Ê" }, //
				{ "e", "e", "ê", "ë", "è", "é" }, //
				{ "G", "G", "Ĝ", "Ğ", "Ġ", "Ģ" },//
				{ "g", "g", "ĝ", "ğ", "ġ", "ģ" },//
				{ "I", "I", "Ì", "Í", "Î", "Ï", "Ĩ" }, //
				{ "i", "i", "ĩ", "ì", "í", "î", "ï" },//
				{ "N", "N", "Ñ" }, //
				{ "n", "n", "ñ", },//
				{ "O", "O", "Ø", "Ò", "Ó", "Ô", "Õ", "Ő", "Ǿ" },//
				{ "o", "o", "ø", "ő", "ò", "ó", "ô", "õ", "ǿ" },//
				{ "Oe", "OE", "Ö", "Œ" },//
				{ "oe", "oe", "ö", "œ" },//
				{ "ss", "ss", "ß" },//
				{ "Aeffin", "AEffin", "Ä\uFB03n" },//
				{ "IJ", "IJ", "Ĳ" },//
				{ "ij", "ij", "ĳ" },//
				{ "U", "U", "Û", "Ù", "Ú", "Ů" },//
				{ "u", "u", "û", "ù", "ú", "ů" },//
				{ "Ue", "UE", "Ü" },//
				{ "ue", "ue", "ü" },//
				{ "T", "T", "Ţ", "Ŧ" },//
				{ "t", "t", "ţ", "ŧ" },//
				{ "Y", "Y", "Ý" }, //
				{ "y", "y", "ý", "ÿ" } //
		};

		/**
		 * Implementation of an iterator that knows how to iterate over the source
		 * test data array for upper- and lowercase mode. It yields one
		 * <code>{ testValue, expectedValue }</code> pair per test value of every
		 * row.
		 *
		 * @author JHAUSHER
		 */
		private static final class TestDataIterator implements Iterator<Object[]> {

			/** Index of the first test value in a row; the two slots before it hold the expectations. */
			private static final int FIRST_TEST_VALUE = 2;

			/** Row the next element is taken from; equals the row count once exhausted. */
			private int dataIndex = 0;

			/** Position of the next test value inside the current row. */
			private int currentIndex = FIRST_TEST_VALUE;

			private final boolean uppercase;

			public TestDataIterator(boolean uppercase) {
				this.uppercase = uppercase;
				skipRowsWithoutTestValues();
			}

			/**
			 * @return <code>true</code> while there is an unvisited row left. The
			 * answer does not depend on the length of any particular row.
			 */
			public boolean hasNext() {
				return dataIndex < testDataSource.length;
			}

			/**
			 * @return the next <code>{ testValue, expectedValue }</code> pair
			 * @throws java.util.NoSuchElementException
			 * if all test data has been consumed
			 */
			public Object[] next() {
				if (!hasNext()) {
					throw new java.util.NoSuchElementException("test data exhausted");
				}
				String[] row = testDataSource[dataIndex];
				Object[] result = { row[currentIndex], uppercase ? row[1] : row[0] };

				currentIndex++;
				if (currentIndex == row.length) {
					/* row finished: continue with the first test value of the next one */
					currentIndex = FIRST_TEST_VALUE;
					dataIndex++;
					skipRowsWithoutTestValues();
				}

				return result;
			}

			/**
			 * The test data is read-only.
			 *
			 * @throws UnsupportedOperationException
			 * always
			 */
			public void remove() {
				throw new UnsupportedOperationException("test data is read-only");
			}

			/** Moves on to the next row that contains at least one test value. */
			private void skipRowsWithoutTestValues() {
				while (dataIndex < testDataSource.length && testDataSource[dataIndex].length <= FIRST_TEST_VALUE) {
					dataIndex++;
				}
			}
		}

	}