Skip to content

Instantly share code, notes, and snippets.

@hkurokawa
Last active November 17, 2018 10:04
Show Gist options
  • Save hkurokawa/fad9918bbe0a6476fc1d03bcc6a6f908 to your computer and use it in GitHub Desktop.
Save hkurokawa/fad9918bbe0a6476fc1d03bcc6a6f908 to your computer and use it in GitHub Desktop.
UTF-8 encoder/decoder in Kotlin
import java.io.ByteArrayOutputStream
/**
 * Wrapper around a single [Int] value, [point], that the encoder and decoder
 * in this file treat as one Unicode code point.
 *
 * The class itself performs no range validation on [point].
 */
@ExperimentalUnsignedTypes
inline class Unicode(val point: Int)
/**
 * UTF-8 encoder/decoder working on arrays of [Unicode] code points.
 *
 * [encode] turns code points into UTF-8 bytes; [decode] is the inverse.
 */
@ExperimentalUnsignedTypes
object UTF8Encoder {
    /**
     * Encodes each code point in [unicodes] as a 1- to 4-byte UTF-8 sequence.
     *
     * No validation is performed on the code point values: the sequence length
     * is chosen purely by comparing against 0x80, 0x800 and 0x10000, so values
     * outside 0..0x10FFFF (or surrogate values) produce bytes that are not
     * valid UTF-8.
     *
     * @param unicodes code points to encode, in order
     * @return the concatenated UTF-8 bytes
     */
    fun encode(unicodes: Array<Unicode>): ByteArray {
        val array = ByteArrayOutputStream()
        for (code in unicodes) {
            val point = code.point
            when {
                point < 0x0080 -> { // 7-bit
                    array.write(byteArrayOf(point.toByte()))
                }
                point < 0x0800 -> { // 11-bit
                    array.write(
                        byteArrayOf(
                            (point shr 6 or 0xc0).toByte(), // 110yyyyx
                            (point and 0x3f or 0x80).toByte() // 10xxxxxx
                        )
                    )
                }
                point < 0x10000 -> { // 16-bit
                    array.write(
                        byteArrayOf(
                            (point shr 12 or 0xe0).toByte(), // 1110yyyy
                            (point shr 6 and 0x3f or 0x80).toByte(), // 10yxxxxx
                            (point and 0x3f or 0x80).toByte() // 10xxxxxx
                        )
                    )
                }
                else -> { // 21-bit
                    array.write(
                        byteArrayOf(
                            (point shr 18 or 0xf0).toByte(), // 11110yyy
                            (point shr 12 and 0x3f or 0x80).toByte(), // 10yyxxxx
                            (point shr 6 and 0x3f or 0x80).toByte(), // 10xxxxxx
                            (point and 0x3f or 0x80).toByte() // 10xxxxxx
                        )
                    )
                }
            }
        }
        return array.toByteArray()
    }

    // The decoder takes advantage of XOR on two's-complement (sign-extended) bytes:
    // XOR-ing the shifted raw bytes with a precomputed mask cancels both the UTF-8
    // marker bits and the sign-extension bits in a single step.
    // See https://github.com/square/okio/blob/bbb29c459e5ccf0f286e0b17ccdcacd7ac4bc2a9/okio/src/main/kotlin/okio/Utf8.kt#L302
    private const val MASK_2BYTES = 0x0f80
    // MASK_2BYTES =
    //   (0xc0.toByte() shl 6) xor
    //   (0x80.toByte().toInt())
    private const val MASK_3BYTES = -0x01e080
    // MASK_3BYTES =
    //   (0xe0.toByte() shl 12) xor
    //   (0x80.toByte() shl 6) xor
    //   (0x80.toByte().toInt())
    private const val MASK_4BYTES = 0x381f80
    // MASK_4BYTES =
    //   (0xf0.toByte() shl 18) xor
    //   (0x80.toByte() shl 12) xor
    //   (0x80.toByte() shl 6) xor
    //   (0x80.toByte().toInt())

    /**
     * Decodes UTF-8 [bytes] into code points.
     *
     * Lead bytes select a 1-, 2-, 3- or 4-byte sequence; every following byte
     * must be a continuation byte of the form `10xxxxxx`. Overlong encodings,
     * surrogate code points and values above 0x10FFFF are not rejected.
     *
     * @param bytes UTF-8 encoded input
     * @return the decoded code points, in order
     * @throws IllegalArgumentException if the input ends in the middle of a
     *   sequence, a lead byte is not a valid UTF-8 lead byte, or a continuation
     *   byte is malformed
     */
    fun decode(bytes: ByteArray): Array<Unicode> {
        val res = mutableListOf<Unicode>()
        var index = 0

        // Returns the next raw byte, sign-extended to Int (0x80..0xFF become negative).
        fun next(): Int {
            if (index >= bytes.size) {
                throw IllegalArgumentException(
                    "The byte code interrupts in the middle of the encoding bytes."
                )
            }
            return bytes[index++].toInt()
        }

        // Returns the next raw byte after verifying it has the 10xxxxxx continuation form.
        fun nextContinuation(): Int {
            val b = next()
            if ((b and 0xc0) != 0x80) {
                throw IllegalArgumentException("Unexpected continuation byte at ${index - 1}: $b")
            }
            return b
        }

        while (index < bytes.size) {
            val leadIndex = index
            val b0 = next()
            // b0 is sign-extended, so the lead-byte class is tested with masks:
            // an arithmetic `shr` alone would drag the sign bits into the result
            // and e.g. `b0 shr 5` could never equal 0b110.
            when {
                (b0 and 0x80) == 0 -> { // 0xxxxxxx
                    res.add(Unicode(b0))
                }
                (b0 and 0xe0) == 0xc0 -> { // 110xxxxx
                    val b1 = nextContinuation()
                    res.add(Unicode(MASK_2BYTES xor (b0 shl 6) xor b1))
                }
                (b0 and 0xf0) == 0xe0 -> { // 1110xxxx
                    val b1 = nextContinuation()
                    val b2 = nextContinuation()
                    res.add(Unicode(MASK_3BYTES xor (b0 shl 12) xor (b1 shl 6) xor b2))
                }
                (b0 and 0xf8) == 0xf0 -> { // 11110xxx
                    val b1 = nextContinuation()
                    val b2 = nextContinuation()
                    val b3 = nextContinuation()
                    res.add(
                        Unicode(MASK_4BYTES xor (b0 shl 18) xor (b1 shl 12) xor (b2 shl 6) xor b3)
                    )
                }
                else -> throw IllegalArgumentException("Unexpected leading byte at $leadIndex: $b0")
            }
        }
        return res.toTypedArray()
    }
}
import org.junit.jupiter.api.Assertions.assertArrayEquals
import org.junit.jupiter.api.Test
/**
 * Fixed-vector and round-trip tests for [UTF8Encoder].
 */
@ExperimentalUnsignedTypes
internal class UTF8EncoderTest {
    // Sample text as code points. \uD83D\uDE00 is the surrogate pair for '😀'
    // (U+1F600), see https://www.fileformat.info/info/unicode/char/1f600/index.htm
    private val sampleCodePoints = "Hello, 世界 \uD83D\uDE00".toUnicodePoints()

    // UTF-8 bytes expected for the sample text, grouped per character.
    private val sampleUtf8 = byteArrayOf(
        0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x2c, 0x20, // "Hello, "
        0xe4, 0xb8, 0x96, // 世
        0xe7, 0x95, 0x8c, // 界
        0x20, // space
        0xf0, 0x9f, 0x98, 0x80 // 😀
    )

    /** Encoding the sample code points yields the expected byte vector. */
    @Test
    fun testEncode() {
        val encoded = UTF8Encoder.encode(sampleCodePoints)
        assertArrayEquals(sampleUtf8, encoded)
    }

    /** Decoding the expected byte vector yields the sample code points. */
    @Test
    fun testDecode() {
        val decoded = UTF8Encoder.decode(sampleUtf8)
        assertArrayEquals(sampleCodePoints, decoded)
    }

    /** Encoding followed by decoding returns the original code points. */
    @Test
    fun testEncodeDecode() {
        val original = "こんにちは世界 Hello,world! \uD83D\uDE00".toUnicodePoints()
        val roundTripped = UTF8Encoder.decode(UTF8Encoder.encode(original))
        assertArrayEquals(original, roundTripped)
    }
}
/**
 * Builds a [ByteArray] from [Int] values so that test vectors can be written
 * as plain hex literals such as `0xe4` without a `.toByte()` on each element.
 *
 * Each value is narrowed with [Int.toByte], which keeps only its low 8 bits.
 */
private fun byteArrayOf(vararg ints: Int): ByteArray =
    ByteArray(ints.size) { position -> ints[position].toByte() }
/**
 * Converts this string's UTF-16 chars into an array of [Unicode] code points.
 *
 * A high surrogate immediately followed by a low surrogate is combined into a
 * single code point via [Character.toCodePoint]; any other char contributes
 * its own numeric value.
 *
 * @return the code points of this string, in order
 * @throws IllegalStateException if a high surrogate is not immediately
 *   followed by a low surrogate, a low surrogate appears without a preceding
 *   high surrogate, or the string ends with a high surrogate
 */
@ExperimentalUnsignedTypes
private fun String.toUnicodePoints(): Array<Unicode> {
    // Renders the full 16-bit code unit for error messages. Char.toByte() would
    // keep only the low 8 bits and report a misleading value for surrogates.
    fun hex(unit: Char): String = "0x" + unit.toInt().toString(16)

    val points = mutableListOf<Unicode>()
    var pendingHigh: Char? = null
    for ((idx, ch) in withIndex()) {
        // Snapshot into a val so the null checks below smart-cast without `!!`.
        val high = pendingHigh
        when {
            ch.isHighSurrogate() -> {
                if (high != null) {
                    throw IllegalStateException(
                        "Expected surrogate pair but is not: [${hex(high)}, ${hex(ch)}] at $idx"
                    )
                }
                pendingHigh = ch
            }
            ch.isLowSurrogate() -> {
                val lead = high ?: throw IllegalStateException(
                    "Unexpected lower surrogate pair: ${hex(ch)} at $idx"
                )
                points.add(Unicode(Character.toCodePoint(lead, ch)))
                pendingHigh = null
            }
            else -> {
                if (high != null) {
                    throw IllegalStateException(
                        "Expected surrogate pair but is not: [${hex(high)}, ${hex(ch)}] at $idx"
                    )
                }
                points.add(Unicode(ch.toInt()))
            }
        }
    }
    pendingHigh?.let { dangling ->
        throw IllegalStateException(
            "High-surrogate char at the end of the string: ${hex(dangling)}"
        )
    }
    return points.toTypedArray()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment