Last active
November 17, 2018 10:04
-
-
Save hkurokawa/fad9918bbe0a6476fc1d03bcc6a6f908 to your computer and use it in GitHub Desktop.
UTF-8 encoder/decoder in Kotlin
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.ByteArrayOutputStream | |
/**
 * A single Unicode code point, wrapped so that it is not confused with a plain [Int].
 *
 * @property point the numeric value of the code point; the constructor performs no validation.
 */
@ExperimentalUnsignedTypes
inline class Unicode(
    val point: Int
)
/**
 * Hand-written UTF-8 encoder/decoder working on arrays of [Unicode] code points.
 *
 * Note: overlong encodings and surrogate code points (U+D800..U+DFFF) are not rejected
 * by either direction; only structural errors are reported.
 */
@ExperimentalUnsignedTypes
object UTF8Encoder {
    // The decoder takes advantage of using XOR on 2's complement stored numbers.
    // See https://github.com/square/okio/blob/bbb29c459e5ccf0f286e0b17ccdcacd7ac4bc2a9/okio/src/main/kotlin/okio/Utf8.kt#L302
    //
    // Bytes >= 0x80 become negative when widened to Int, so the marker bits of each byte
    // *and* its sign extension are fixed for a given sequence length. XOR-ing the shifted
    // bytes with the matching constant below cancels all of them in one step, leaving only
    // the payload bits.

    // MASK_2BYTES =
    //     (0xc0.toByte() shl 6) xor
    //     (0x80.toByte().toInt())
    private const val MASK_2BYTES = 0x0f80

    // MASK_3BYTES =
    //     (0xe0.toByte() shl 12) xor
    //     (0x80.toByte() shl 6) xor
    //     (0x80.toByte().toInt())
    private const val MASK_3BYTES = -0x01e080

    // MASK_4BYTES =
    //     (0xf0.toByte() shl 18) xor
    //     (0x80.toByte() shl 12) xor
    //     (0x80.toByte() shl 6) xor
    //     (0x80.toByte().toInt())
    private const val MASK_4BYTES = 0x381f80

    /**
     * Encodes [unicodes] as a UTF-8 byte sequence.
     *
     * @param unicodes the code points to encode, in order.
     * @return the concatenated UTF-8 bytes (empty for an empty input).
     * @throws IllegalArgumentException if a code point is negative or greater than U+10FFFF;
     *   such values cannot be represented and previously produced garbage bytes silently.
     */
    fun encode(unicodes: Array<Unicode>): ByteArray {
        val out = ByteArrayOutputStream()
        for (code in unicodes) {
            val point = code.point
            require(point in 0..Character.MAX_CODE_POINT) {
                "Code point out of range: $point"
            }
            // ByteArrayOutputStream.write(Int) stores only the low 8 bits of its argument.
            when {
                point < 0x0080 -> { // 7-bit
                    out.write(point)
                }
                point < 0x0800 -> { // 11-bit
                    out.write(point shr 6 or 0xc0) // 110yyyyx
                    out.write(point and 0x3f or 0x80) // 10xxxxxx
                }
                point < 0x10000 -> { // 16-bit
                    out.write(point shr 12 or 0xe0) // 1110yyyy
                    out.write(point shr 6 and 0x3f or 0x80) // 10yxxxxx
                    out.write(point and 0x3f or 0x80) // 10xxxxxx
                }
                else -> { // 21-bit
                    out.write(point shr 18 or 0xf0) // 11110yyy
                    out.write(point shr 12 and 0x3f or 0x80) // 10yyxxxx
                    out.write(point shr 6 and 0x3f or 0x80) // 10xxxxxx
                    out.write(point and 0x3f or 0x80) // 10xxxxxx
                }
            }
        }
        return out.toByteArray()
    }

    /**
     * Decodes a UTF-8 byte sequence into code points.
     *
     * @param bytes the UTF-8 encoded input.
     * @return the decoded code points, in order (empty for an empty input).
     * @throws IllegalArgumentException if the input ends in the middle of a multi-byte
     *   sequence, if a leading byte is not a valid UTF-8 lead, or if a continuation
     *   byte does not have the form `10xxxxxx`.
     */
    fun decode(bytes: ByteArray): Array<Unicode> {
        val res = mutableListOf<Unicode>()
        var index = 0

        // Reads the next byte, sign-extended to Int (the MASK_* constants rely on that).
        fun next(): Int {
            if (index >= bytes.size) {
                throw IllegalArgumentException(
                    "The byte code interrupts in the middle of the encoding bytes."
                )
            }
            return bytes[index++].toInt()
        }

        // Reads the next byte and verifies it is a continuation byte (10xxxxxx).
        fun continuation(): Int {
            val b = next()
            if ((b and 0xc0) != 0x80) {
                throw IllegalArgumentException("Unexpected continuation byte at ${index - 1}: $b")
            }
            return b
        }

        while (index < bytes.size) {
            val start = index
            val b0 = next()
            // b0 is sign-extended, so the lead pattern must be isolated with a mask:
            // an arithmetic shift alone keeps the sign bits and never equals the pattern.
            when {
                b0 >= 0 -> { // 0xxxxxxx
                    res.add(Unicode(b0))
                }
                (b0 and 0xe0) == 0xc0 -> { // 110xxxxx
                    val b1 = continuation()
                    res.add(Unicode(MASK_2BYTES xor (b0 shl 6) xor b1))
                }
                (b0 and 0xf0) == 0xe0 -> { // 1110xxxx
                    val b1 = continuation()
                    val b2 = continuation()
                    res.add(Unicode(MASK_3BYTES xor (b0 shl 12) xor (b1 shl 6) xor b2))
                }
                (b0 and 0xf8) == 0xf0 -> { // 11110xxx
                    val b1 = continuation()
                    val b2 = continuation()
                    val b3 = continuation()
                    res.add(
                        Unicode(MASK_4BYTES xor (b0 shl 18) xor (b1 shl 12) xor (b2 shl 6) xor b3)
                    )
                }
                else -> throw IllegalArgumentException("Unexpected leading byte at $start: $b0")
            }
        }
        return res.toTypedArray()
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.junit.jupiter.api.Assertions.assertArrayEquals | |
import org.junit.jupiter.api.Test | |
/**
 * Fixed-vector and round-trip tests for [UTF8Encoder].
 */
@ExperimentalUnsignedTypes
internal class UTF8EncoderTest {
    // Expected UTF-8 encoding of the sample text below, grouped per character.
    private val helloWorldBytes = byteArrayOf(
        0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x2c, 0x20, // "Hello, "
        0xe4, 0xb8, 0x96, // 世
        0xe7, 0x95, 0x8c, // 界
        0x20, // space
        0xf0, 0x9f, 0x98, 0x80 // U+1F600
    )

    // \uD83D\uDE00 represents '😀' See https://www.fileformat.info/info/unicode/char/1f600/index.htm
    private val helloWorldUnicodePoints = "Hello, 世界 \uD83D\uDE00".toUnicodePoints()

    /** Encoding the sample code points yields the expected byte vector. */
    @Test
    fun testEncode() {
        val encoded = UTF8Encoder.encode(helloWorldUnicodePoints)
        assertArrayEquals(helloWorldBytes, encoded)
    }

    /** Decoding the expected byte vector yields the sample code points. */
    @Test
    fun testDecode() {
        val decoded = UTF8Encoder.decode(helloWorldBytes)
        assertArrayEquals(helloWorldUnicodePoints, decoded)
    }

    /** Encoding followed by decoding returns the original code points. */
    @Test
    fun testEncodeDecode() {
        val s = "こんにちは世界 Hello,world! \uD83D\uDE00".toUnicodePoints()
        val roundTripped = UTF8Encoder.decode(UTF8Encoder.encode(s))
        assertArrayEquals(s, roundTripped)
    }
}
/**
 * Builds a [ByteArray] from [Int] literals so that values above 0x7f can be written
 * without an explicit `.toByte()` on each one; every value is narrowed with [Int.toByte].
 */
private fun byteArrayOf(vararg ints: Int): ByteArray =
    ByteArray(ints.size) { position -> ints[position].toByte() }
/**
 * Splits this string into Unicode code points, combining each UTF-16 surrogate pair
 * into a single supplementary code point.
 *
 * @return the code points of the receiver, in order.
 * @throws IllegalStateException if the string contains an unpaired surrogate: a low
 *   surrogate with no preceding high surrogate, a high surrogate followed by anything
 *   other than a low surrogate, or a high surrogate at the very end.
 */
@ExperimentalUnsignedTypes
private fun String.toUnicodePoints(): Array<Unicode> {
    // Formats a UTF-16 code unit in full; Char.toByte() would keep only its low 8 bits.
    fun hex(unit: Char): String = "0x%04X".format(unit.toInt())

    val points = mutableListOf<Unicode>()
    var idx = 0
    while (idx < length) {
        val unit = this[idx]
        // codePointAt merges a valid pair starting at idx; otherwise it returns the unit itself.
        val codePoint = codePointAt(idx)
        val width = Character.charCount(codePoint)
        when {
            unit.isLowSurrogate() ->
                error("Unexpected lower surrogate pair: ${hex(unit)} at $idx")
            unit.isHighSurrogate() && width == 1 -> {
                if (idx + 1 >= length) {
                    error("High-surrogate char at the end of the string: ${hex(unit)}")
                }
                error(
                    "Expected surrogate pair but is not: " +
                        "[${hex(unit)}, ${hex(this[idx + 1])}] at ${idx + 1}"
                )
            }
        }
        points.add(Unicode(codePoint))
        idx += width
    }
    return points.toTypedArray()
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment