Last active
August 29, 2015 14:11
-
-
Save aembleton/c3495438eac9ea19fff6 to your computer and use it in GitHub Desktop.
Translates between ASCII and Unicode.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scala.annotation.tailrec | |
/**
 * Converts text between its plain form and an ASCII-only form in which every
 * character above code 127 is written as a six-character escape: a backslash,
 * the letter `u`, and the four-digit upper-case hex value of the UTF-16 code unit.
 *
 * Backslashes that are already present in the input are not escaped, so text that
 * literally contains such an escape sequence does not survive a round trip.
 */
object UnicodeTranslator {

  // One complete escape: a literal backslash, 'u', then exactly four hex digits.
  private val unicodeRegex = """(\\u[0-9a-fA-F]{4})""".r

  /**
   * Replaces every character above 127 with its escape sequence.
   *
   * @param stringToEscape text that may contain non-ASCII characters
   * @return the same text with ASCII characters untouched and every other
   *         UTF-16 code unit replaced by its escape
   */
  def escapeUnicode(stringToEscape: String): String = {
    // A builder keeps this linear; repeated String concatenation re-copies the result each step.
    val escaped = new StringBuilder(stringToEscape.length)
    stringToEscape.foreach { c =>
      if (c <= 127) escaped.append(c) else escaped.append(charToHex(c))
    }
    escaped.toString
  }

  /**
   * Replaces every well-formed escape sequence with the character it denotes.
   * Anything that is not a complete escape (for example one with fewer than four
   * hex digits) is copied through unchanged.
   *
   * @param stringToRemoveEscapesFrom text that may contain escape sequences
   * @return the text with each escape replaced by its character
   */
  def unescapeUnicode(stringToRemoveEscapesFrom: String): String =
    unicodeRegex.replaceAllIn(
      stringToRemoveEscapesFrom,
      // The decoded character may itself be '$' or a backslash, which are special in
      // replacement strings, so it has to be quoted before being handed back.
      m => scala.util.matching.Regex.quoteReplacement(hexToChar(m.matched).toString)
    )

  /** Formats a character as its escape: backslash, 'u', four zero-padded upper-case hex digits. */
  private def charToHex(c: Char): String = "\\u%04X".format(c.toInt)

  /** Decodes a six-character escape by parsing the four hex digits after the two-character prefix. */
  private def hexToChar(hex: String): Char = Integer.parseInt(hex.drop(2), 16).toChar
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.junit.runner.RunWith | |
import org.specs2.mutable.Specification | |
import org.specs2.runner.JUnitRunner | |
/**
 * Specification for [[UnicodeTranslator]]: checks that escaping and unescaping
 * leave ASCII text alone and convert accented characters to and from their
 * escaped form.
 */
@RunWith(classOf[JUnitRunner])
class UnicodeTranslatorTest extends Specification {

  // Shared fixtures: the same text in plain and escaped form.
  private val asciiOnly = "Hello World"
  private val singleAccent = "ò"
  private val singleAccentEscaped = "\\u00F2"
  private val accentedPhrase = "Bòrd na Gàidhlig"
  private val accentedPhraseEscaped = "B\\u00F2rd na G\\u00E0idhlig"

  "escaping unicode" should {

    "not make any modifications for ASCII strings" in {
      val result = UnicodeTranslator.escapeUnicode(asciiOnly)
      result mustEqual asciiOnly
    }

    "replace a single character with it's unicode equivalent" in {
      val result = UnicodeTranslator.escapeUnicode(singleAccent)
      result mustEqual singleAccentEscaped
    }

    "replace the accented characters in a String" in {
      val result = UnicodeTranslator.escapeUnicode(accentedPhrase)
      result mustEqual accentedPhraseEscaped
    }
  }

  "unescaping unicode" should {

    "not make any modifications for ASCII strings" in {
      val result = UnicodeTranslator.unescapeUnicode(asciiOnly)
      result mustEqual asciiOnly
    }

    "replace a unicode with it's equivalent character" in {
      val result = UnicodeTranslator.unescapeUnicode(singleAccentEscaped)
      result mustEqual singleAccent
    }

    "replace unicodes in a string with the equivalent accented characters" in {
      val result = UnicodeTranslator.unescapeUnicode(accentedPhraseEscaped)
      result mustEqual accentedPhrase
    }
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment