hkurokawa · November 17, 2018 10:04
diff --git a/UTF8Encoder.kt b/UTF8Encoder.kt
 import java.io.ByteArrayOutputStream

 /**
  * Wrapper around a single [Int] value.
  *
  * [UTF8Encoder] reads [point] as the numeric value of one Unicode code point when encoding,
  * and produces instances of this class when decoding.
  */
 @ExperimentalUnsignedTypes
 inline class Unicode(val point: Int)

 /**
  * Minimal UTF-8 codec converting between [Unicode] code points and their UTF-8 byte sequences.
  *
  * Surrogate code points and overlong encodings are not rejected by either direction.
  */
 @ExperimentalUnsignedTypes
 object UTF8Encoder {
  /**
   * Encodes every code point of [unicodes] as UTF-8 and returns the concatenated bytes.
   *
   * Each code point becomes 1 to 4 bytes depending on its magnitude.
   *
   * @param unicodes code points to encode, in order
   * @return the UTF-8 bytes of all code points, in the same order
   * @throws IllegalArgumentException if a code point is negative or greater than 0x10FFFF
   */
  fun encode(unicodes: Array<Unicode>): ByteArray {
    val array = ByteArrayOutputStream()
    for (code in unicodes) {
      val point = code.point
      // Values outside this range do not fit the bit layouts below and would silently
      // produce corrupt output (a negative value would even take the 1-byte branch).
      require(point in 0..0x10FFFF) { "Code point out of Unicode range: $point" }
      // ByteArrayOutputStream.write(Int) stores only the low 8 bits of its argument,
      // which is exactly the byte each expression below describes.
      when {
        point < 0x0080 -> { // 7-bit
          array.write(point)
        }
        point < 0x0800 -> { // 11-bit
          array.write(point shr 6 or 0xc0) // 110yyyyx
          array.write(point and 0x3f or 0x80) // 10xxxxxx
        }
        point < 0x10000 -> { // 16-bit
          array.write(point shr 12 or 0xe0) // 1110yyyy
          array.write(point shr 6 and 0x3f or 0x80) // 10yxxxxx
          array.write(point and 0x3f or 0x80) // 10xxxxxx
        }
        else -> { // 21-bit
          array.write(point shr 18 or 0xf0) // 11110yyy
          array.write(point shr 12 and 0x3f or 0x80) // 10yyxxxx
          array.write(point shr 6 and 0x3f or 0x80) // 10xxxxxx
          array.write(point and 0x3f or 0x80) // 10xxxxxx
        }
      }
    }
    return array.toByteArray()
  }

  // The following method takes advantage of using XOR on 2's complement stored numbers:
  // bytes are widened to sign-extended Ints, shifted into place and XOR-ed together, and the
  // mask cancels both the marker bits and the sign-extension bits in a single operation.
  // See https://github.com/square/okio/blob/bbb29c459e5ccf0f286e0b17ccdcacd7ac4bc2a9/okio/src/main/kotlin/okio/Utf8.kt#L302
  private const val MASK_2BYTES = 0x0f80
  // MASK_2BYTES =
  //    (0xc0.toByte() shl 6) xor
  //    (0x80.toByte().toInt())

  private const val MASK_3BYTES = -0x01e080
  // MASK_3BYTES =
  //    (0xe0.toByte() shl 12) xor
  //    (0x80.toByte() shl 6) xor
  //    (0x80.toByte().toInt())

  private const val MASK_4BYTES = 0x381f80
  // MASK_4BYTES =
  //    (0xf0.toByte() shl 18) xor
  //    (0x80.toByte() shl 12) xor
  //    (0x80.toByte() shl 6) xor
  //    (0x80.toByte().toInt())

  /**
   * Decodes UTF-8 [bytes] into the code points they represent.
   *
   * @param bytes UTF-8 encoded input
   * @return the decoded code points, in input order
   * @throws IllegalArgumentException if the input ends in the middle of a multi-byte sequence,
   *   a continuation byte does not have the form 10xxxxxx, or a leading byte is not a valid
   *   1- to 4-byte sequence start
   */
  fun decode(bytes: ByteArray): Array<Unicode> {
    val res = mutableListOf<Unicode>()
    var index = 0

    // Returns the next byte, sign-extended to Int (values 0x80..0xff come back negative).
    fun next(): Int {
      require(index < bytes.size) {
        "The byte code interrupts in the middle of the encoding bytes."
      }
      return bytes[index++].toInt()
    }

    // Returns the next byte after verifying that it is a continuation byte (10xxxxxx).
    fun continuation(): Int {
      val b = next()
      require((b and 0xc0) == 0x80) { "Unexpected continuation byte at ${index - 1}: $b" }
      return b
    }

    while (index < bytes.size) {
      val b0 = next()
      // b0 is sign-extended, so `shr` drags ones in from the left for any byte >= 0x80.
      // Every multi-byte prefix test must therefore mask the shifted value before comparing.
      when {
        b0 shr 7 == 0 -> {
          res.add(Unicode(b0))
        }
        b0 shr 5 and 0x07 == 0b110 -> {
          val b1 = continuation()
          res.add(Unicode(MASK_2BYTES xor (b0 shl 6) xor b1))
        }
        b0 shr 4 and 0x0f == 0b1110 -> {
          val b1 = continuation()
          val b2 = continuation()
          res.add(Unicode(MASK_3BYTES xor (b0 shl 12) xor (b1 shl 6) xor b2))
        }
        b0 shr 3 and 0x1f == 0b11110 -> {
          val b1 = continuation()
          val b2 = continuation()
          val b3 = continuation()
          res.add(Unicode(MASK_4BYTES xor (b0 shl 18) xor (b1 shl 12) xor (b2 shl 6) xor b3))
        }
        // index has already moved past b0, hence the -1.
        else -> throw IllegalArgumentException("Unexpected leading byte at ${index - 1}: $b0")
      }
    }
    return res.toTypedArray()
  }
 }
diff --git a/UTF8EncoderTest.kt b/UTF8EncoderTest.kt
 import org.junit.jupiter.api.Assertions.assertArrayEquals
 import org.junit.jupiter.api.Test

 /**
  * Checks [UTF8Encoder] against a fixed sample containing 1-, 3- and 4-byte characters,
  * plus an encode/decode round trip.
  */
 @ExperimentalUnsignedTypes
 internal class UTF8EncoderTest {
  // The trailing \uD83D\uDE00 surrogate pair is '😀' (U+1F600),
  // see https://www.fileformat.info/info/unicode/char/1f600/index.htm
  private val greetingPoints = "Hello, 世界 \uD83D\uDE00".toUnicodePoints()

  // Expected UTF-8 bytes of the greeting, grouped by the characters they encode.
  private val greetingUtf8 = byteArrayOf(
      0x48, 0x65, 0x6c, 0x6c, 0x6f, // "Hello"
      0x2c, 0x20, // ", "
      0xe4, 0xb8, 0x96, // 世
      0xe7, 0x95, 0x8c, // 界
      0x20, // " "
      0xf0, 0x9f, 0x98, 0x80 // 😀
  )

  @Test
  fun testEncode() {
    val encoded = UTF8Encoder.encode(greetingPoints)
    assertArrayEquals(greetingUtf8, encoded)
  }

  @Test
  fun testDecode() {
    val decoded = UTF8Encoder.decode(greetingUtf8)
    assertArrayEquals(greetingPoints, decoded)
  }

  @Test
  fun testEncodeDecode() {
    val original = "こんにちは世界 Hello,world! \uD83D\uDE00".toUnicodePoints()
    val roundTripped = UTF8Encoder.decode(UTF8Encoder.encode(original))
    assertArrayEquals(original, roundTripped)
  }
 }

 /**
  * Builds a [ByteArray] from Int literals so that values above 0x7f can be written without
  * an explicit cast; each value is narrowed with [Int.toByte].
  */
 private fun byteArrayOf(vararg ints: Int): ByteArray =
    ByteArray(ints.size) { position -> ints[position].toByte() }

 /**
  * Converts this UTF-16 string into an array of code points, combining each surrogate pair
  * into a single [Unicode] value.
  *
  * @throws IllegalStateException if a high surrogate is not immediately followed by a low
  *   surrogate, a low surrogate appears without a preceding high surrogate, or the string
  *   ends with a high surrogate
  */
 @ExperimentalUnsignedTypes
 private fun String.toUnicodePoints(): Array<Unicode> {
  val points = mutableListOf<Unicode>()
  // High surrogate seen on the previous char and still waiting for its low half.
  var pendingHigh: Char? = null
  for (position in indices) {
    val current = this[position]
    val lead = pendingHigh
    // A pending high surrogate may only be followed by a low surrogate.
    if (lead != null && !current.isLowSurrogate()) {
      throw IllegalStateException(
          "Expected surrogate pair but is not: [${lead.toByte()}, ${current.toByte()}] at $position"
      )
    }
    // A low surrogate is only valid as the second half of a pair.
    if (lead == null && current.isLowSurrogate()) {
      throw IllegalStateException(
          "Unexpected lower surrogate pair: ${current.toByte()} at $position"
      )
    }
    if (current.isHighSurrogate()) {
      pendingHigh = current
    } else if (lead != null) {
      points.add(Unicode(Character.toCodePoint(lead, current)))
      pendingHigh = null
    } else {
      points.add(Unicode(current.toInt()))
    }
  }
  val dangling = pendingHigh
  if (dangling != null) {
    throw IllegalStateException(
        "High-surrogate char at the end of the string: ${dangling.toByte()}"
    )
  }
  return points.toTypedArray()
 }
	import java.io.ByteArrayOutputStream

	/**
	 * Wrapper around a single [Int] value.
	 *
	 * [UTF8Encoder] reads [point] as the numeric value of one Unicode code point when encoding,
	 * and produces instances of this class when decoding.
	 */
	@ExperimentalUnsignedTypes
	inline class Unicode(val point: Int)

	/**
	 * Minimal UTF-8 codec converting between [Unicode] code points and their UTF-8 byte sequences.
	 *
	 * Surrogate code points and overlong encodings are not rejected by either direction.
	 */
	@ExperimentalUnsignedTypes
	object UTF8Encoder {
	  /**
	   * Encodes every code point of [unicodes] as UTF-8 and returns the concatenated bytes.
	   *
	   * Each code point becomes 1 to 4 bytes depending on its magnitude.
	   *
	   * @param unicodes code points to encode, in order
	   * @return the UTF-8 bytes of all code points, in the same order
	   * @throws IllegalArgumentException if a code point is negative or greater than 0x10FFFF
	   */
	  fun encode(unicodes: Array<Unicode>): ByteArray {
	    val array = ByteArrayOutputStream()
	    for (code in unicodes) {
	      val point = code.point
	      // Values outside this range do not fit the bit layouts below and would silently
	      // produce corrupt output (a negative value would even take the 1-byte branch).
	      require(point in 0..0x10FFFF) { "Code point out of Unicode range: $point" }
	      // ByteArrayOutputStream.write(Int) stores only the low 8 bits of its argument,
	      // which is exactly the byte each expression below describes.
	      when {
	        point < 0x0080 -> { // 7-bit
	          array.write(point)
	        }
	        point < 0x0800 -> { // 11-bit
	          array.write(point shr 6 or 0xc0) // 110yyyyx
	          array.write(point and 0x3f or 0x80) // 10xxxxxx
	        }
	        point < 0x10000 -> { // 16-bit
	          array.write(point shr 12 or 0xe0) // 1110yyyy
	          array.write(point shr 6 and 0x3f or 0x80) // 10yxxxxx
	          array.write(point and 0x3f or 0x80) // 10xxxxxx
	        }
	        else -> { // 21-bit
	          array.write(point shr 18 or 0xf0) // 11110yyy
	          array.write(point shr 12 and 0x3f or 0x80) // 10yyxxxx
	          array.write(point shr 6 and 0x3f or 0x80) // 10xxxxxx
	          array.write(point and 0x3f or 0x80) // 10xxxxxx
	        }
	      }
	    }
	    return array.toByteArray()
	  }

	  // The following method takes advantage of using XOR on 2's complement stored numbers:
	  // bytes are widened to sign-extended Ints, shifted into place and XOR-ed together, and the
	  // mask cancels both the marker bits and the sign-extension bits in a single operation.
	  // See https://github.com/square/okio/blob/bbb29c459e5ccf0f286e0b17ccdcacd7ac4bc2a9/okio/src/main/kotlin/okio/Utf8.kt#L302
	  private const val MASK_2BYTES = 0x0f80
	  // MASK_2BYTES =
	  //    (0xc0.toByte() shl 6) xor
	  //    (0x80.toByte().toInt())

	  private const val MASK_3BYTES = -0x01e080
	  // MASK_3BYTES =
	  //    (0xe0.toByte() shl 12) xor
	  //    (0x80.toByte() shl 6) xor
	  //    (0x80.toByte().toInt())

	  private const val MASK_4BYTES = 0x381f80
	  // MASK_4BYTES =
	  //    (0xf0.toByte() shl 18) xor
	  //    (0x80.toByte() shl 12) xor
	  //    (0x80.toByte() shl 6) xor
	  //    (0x80.toByte().toInt())

	  /**
	   * Decodes UTF-8 [bytes] into the code points they represent.
	   *
	   * @param bytes UTF-8 encoded input
	   * @return the decoded code points, in input order
	   * @throws IllegalArgumentException if the input ends in the middle of a multi-byte sequence,
	   *   a continuation byte does not have the form 10xxxxxx, or a leading byte is not a valid
	   *   1- to 4-byte sequence start
	   */
	  fun decode(bytes: ByteArray): Array<Unicode> {
	    val res = mutableListOf<Unicode>()
	    var index = 0

	    // Returns the next byte, sign-extended to Int (values 0x80..0xff come back negative).
	    fun next(): Int {
	      require(index < bytes.size) {
	        "The byte code interrupts in the middle of the encoding bytes."
	      }
	      return bytes[index++].toInt()
	    }

	    // Returns the next byte after verifying that it is a continuation byte (10xxxxxx).
	    fun continuation(): Int {
	      val b = next()
	      require((b and 0xc0) == 0x80) { "Unexpected continuation byte at ${index - 1}: $b" }
	      return b
	    }

	    while (index < bytes.size) {
	      val b0 = next()
	      // b0 is sign-extended, so `shr` drags ones in from the left for any byte >= 0x80.
	      // Every multi-byte prefix test must therefore mask the shifted value before comparing.
	      when {
	        b0 shr 7 == 0 -> {
	          res.add(Unicode(b0))
	        }
	        b0 shr 5 and 0x07 == 0b110 -> {
	          val b1 = continuation()
	          res.add(Unicode(MASK_2BYTES xor (b0 shl 6) xor b1))
	        }
	        b0 shr 4 and 0x0f == 0b1110 -> {
	          val b1 = continuation()
	          val b2 = continuation()
	          res.add(Unicode(MASK_3BYTES xor (b0 shl 12) xor (b1 shl 6) xor b2))
	        }
	        b0 shr 3 and 0x1f == 0b11110 -> {
	          val b1 = continuation()
	          val b2 = continuation()
	          val b3 = continuation()
	          res.add(Unicode(MASK_4BYTES xor (b0 shl 18) xor (b1 shl 12) xor (b2 shl 6) xor b3))
	        }
	        // index has already moved past b0, hence the -1.
	        else -> throw IllegalArgumentException("Unexpected leading byte at ${index - 1}: $b0")
	      }
	    }
	    return res.toTypedArray()
	  }
	}
	import org.junit.jupiter.api.Assertions.assertArrayEquals
	import org.junit.jupiter.api.Test

	/**
	 * Checks [UTF8Encoder] against a fixed sample containing 1-, 3- and 4-byte characters,
	 * plus an encode/decode round trip.
	 */
	@ExperimentalUnsignedTypes
	internal class UTF8EncoderTest {
	  // The trailing \uD83D\uDE00 surrogate pair is '😀' (U+1F600),
	  // see https://www.fileformat.info/info/unicode/char/1f600/index.htm
	  private val greetingPoints = "Hello, 世界 \uD83D\uDE00".toUnicodePoints()

	  // Expected UTF-8 bytes of the greeting, grouped by the characters they encode.
	  private val greetingUtf8 = byteArrayOf(
	      0x48, 0x65, 0x6c, 0x6c, 0x6f, // "Hello"
	      0x2c, 0x20, // ", "
	      0xe4, 0xb8, 0x96, // 世
	      0xe7, 0x95, 0x8c, // 界
	      0x20, // " "
	      0xf0, 0x9f, 0x98, 0x80 // 😀
	  )

	  @Test
	  fun testEncode() {
	    val encoded = UTF8Encoder.encode(greetingPoints)
	    assertArrayEquals(greetingUtf8, encoded)
	  }

	  @Test
	  fun testDecode() {
	    val decoded = UTF8Encoder.decode(greetingUtf8)
	    assertArrayEquals(greetingPoints, decoded)
	  }

	  @Test
	  fun testEncodeDecode() {
	    val original = "こんにちは世界 Hello,world! \uD83D\uDE00".toUnicodePoints()
	    val roundTripped = UTF8Encoder.decode(UTF8Encoder.encode(original))
	    assertArrayEquals(original, roundTripped)
	  }
	}

	/**
	 * Builds a [ByteArray] from Int literals so that values above 0x7f can be written without
	 * an explicit cast; each value is narrowed with [Int.toByte].
	 */
	private fun byteArrayOf(vararg ints: Int): ByteArray =
	    ByteArray(ints.size) { position -> ints[position].toByte() }

	/**
	 * Converts this UTF-16 string into an array of code points, combining each surrogate pair
	 * into a single [Unicode] value.
	 *
	 * @throws IllegalStateException if a high surrogate is not immediately followed by a low
	 *   surrogate, a low surrogate appears without a preceding high surrogate, or the string
	 *   ends with a high surrogate
	 */
	@ExperimentalUnsignedTypes
	private fun String.toUnicodePoints(): Array<Unicode> {
	  val points = mutableListOf<Unicode>()
	  // High surrogate seen on the previous char and still waiting for its low half.
	  var pendingHigh: Char? = null
	  for (position in indices) {
	    val current = this[position]
	    val lead = pendingHigh
	    // A pending high surrogate may only be followed by a low surrogate.
	    if (lead != null && !current.isLowSurrogate()) {
	      throw IllegalStateException(
	          "Expected surrogate pair but is not: [${lead.toByte()}, ${current.toByte()}] at $position"
	      )
	    }
	    // A low surrogate is only valid as the second half of a pair.
	    if (lead == null && current.isLowSurrogate()) {
	      throw IllegalStateException(
	          "Unexpected lower surrogate pair: ${current.toByte()} at $position"
	      )
	    }
	    if (current.isHighSurrogate()) {
	      pendingHigh = current
	    } else if (lead != null) {
	      points.add(Unicode(Character.toCodePoint(lead, current)))
	      pendingHigh = null
	    } else {
	      points.add(Unicode(current.toInt()))
	    }
	  }
	  val dangling = pendingHigh
	  if (dangling != null) {
	    throw IllegalStateException(
	        "High-surrogate char at the end of the string: ${dangling.toByte()}"
	    )
	  }
	  return points.toTypedArray()
	}