Skip to content

Instantly share code, notes, and snippets.

@andresilva
Created July 6, 2015 17:17
Show Gist options
  • Save andresilva/c19c1ab03d46fc9bedc4 to your computer and use it in GitHub Desktop.
Save andresilva/c19c1ab03d46fc9bedc4 to your computer and use it in GitHub Desktop.
JVM string decoding performance
package benchmark
import java.nio._
import java.util.concurrent.TimeUnit
import org.openjdk.jmh.annotations._
/** Deterministic input generators shared by the decoding benchmarks. */
object Random {

  /**
   * Returns a brand-new generator seeded with 0.
   *
   * Because this is a `def`, every access restarts the pseudo-random sequence,
   * so repeated calls to the helpers below with the same `size` yield the same string.
   */
  def random: scala.util.Random = new scala.util.Random(0)

  /** Returns a string of `size` characters produced by `scala.util.Random.nextString`. */
  def nextString(size: Int): String = random.nextString(size)

  /**
   * Returns a string of exactly `size` characters, each drawn from
   * `scala.util.Random.nextPrintableChar`.
   *
   * @param size number of characters to generate
   */
  def nextAsciiString(size: Int): String = {
    // Grab one generator up front: `random` hands out a fresh seed-0 instance on
    // every access, which would otherwise repeat the same first character.
    val r = random
    // Honour the requested size (previously hard-coded to 100000 characters).
    Array.fill[Char](size)(r.nextPrintableChar()).mkString
  }
}
/**
 * Measures UTF-8 decoding through the JDK constructor
 * `new String(bytes, offset, length, charsetName)`.
 */
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@State(Scope.Benchmark)
class UTF8StringBenchmark {
  /** Number of characters in the generated input string. */
  @Param(Array("10", "100", "1000", "10000", "100000"))
  var size: Int = _

  /** Selects `Random.nextAsciiString` (true) or `Random.nextString` (false) as the input source. */
  @Param(Array("true", "false"))
  var ascii: Boolean = _

  /** Source string; populated by [[setup]]. */
  var string: String = _

  /** UTF-8 encoding of [[string]]; this is what the benchmark decodes. */
  var utf8Bytes: Array[Byte] = _

  /**
   * Builds the benchmark input.
   *
   * JMH injects `@Param` fields only after the state object has been constructed,
   * so the input cannot be a constructor-time `val`: it would be computed with
   * `size = 0` and `ascii = false` and the benchmark would decode an empty string.
   */
  @Setup(Level.Trial)
  def setup(): Unit = {
    string = if (ascii) Random.nextAsciiString(size) else Random.nextString(size)
    utf8Bytes = string.getBytes("UTF-8")
  }

  /** Decodes the prepared bytes; the result is returned so JMH consumes it. */
  @Benchmark
  def decode(): String =
    new String(utf8Bytes, 0, utf8Bytes.length, "UTF-8")
}
/**
 * Table-driven UTF-8 decoder implemented as a deterministic finite automaton.
 *
 * Each input byte is first mapped to a character class, then the pair
 * (current state, character class) selects the next state. States are stored
 * pre-multiplied by 12 (the number of character classes) so that
 * `256 + state + class` indexes the transition table directly.
 */
object UTF8DFA {
  /** State reached whenever a complete code point has been consumed; also the start state. */
  final val UTF8_ACCEPT = 0

  /** Error state. Every transition out of it leads back to it, so it is never left once entered. */
  final val UTF8_REJECT = 12

  /**
   * Combined lookup table: indices 0-255 hold the character class of each byte
   * value, indices from 256 onwards hold the state transition table.
   */
  final val utf8d: Array[Byte] = Array(
    // The first part of the table maps bytes to character classes, which
    // reduces the size of the transition table and creates bitmasks.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    // The second part is a transition table that maps a combination
    // of a state of the automaton and a character class to a state.
    0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
    12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12,
    12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
    12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
    12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12)

  /**
   * Decodes `length` bytes of `in`, starting at `offset`, into UTF-16 chars
   * written to `out` from index 0.
   *
   * @param in     buffer holding UTF-8 encoded bytes
   * @param offset index of the first byte to decode
   * @param length number of bytes to decode
   * @param out    destination buffer; it must have room for every produced char
   *               (never more than `length` of them)
   * @return the number of chars written to `out`
   * @throws IllegalArgumentException if the bytes are not well-formed UTF-8 or
   *                                  end in the middle of a multi-byte sequence
   */
  def decode(in: Array[Byte], offset: Int, length: Int, out: Array[Char]): Int = {
    val end = offset + length
    var i = offset
    var o = 0
    var state = UTF8_ACCEPT
    var codepoint = 0
    while (i < end) {
      // Widen to an unsigned value so it can index the class table.
      val b = in(i) & 0xff
      i += 1
      if (state != UTF8_ACCEPT) {
        // Inside a multi-byte sequence: fold in the 6 payload bits of this byte.
        val ttype = utf8d(b)
        codepoint = (codepoint << 6) | (b & 63)
        state = utf8d(256 + state + ttype)
        if (state == UTF8_ACCEPT) {
          if (codepoint > 0xffff) {
            // Supplementary code point: emit a surrogate pair.
            // 0xd7c0 is 0xd800 - (0x10000 >> 10), i.e. the high-surrogate base
            // with the 0x10000 bias already subtracted.
            out(o) = (0xd7c0 + (codepoint >> 10)).toChar
            o += 1
            out(o) = (0xdc00 + (codepoint & 0x3ff)).toChar
            o += 1
          } else {
            out(o) = codepoint.toChar
            o += 1
          }
        }
      } else if (b > 0x7f) {
        // Lead byte of a multi-byte sequence: the class doubles as the number of
        // high bits to mask off in order to keep only the payload bits.
        val ttype = utf8d(b)
        codepoint = b & (255 >> ttype)
        state = utf8d(256 + ttype)
      } else {
        // Plain ASCII byte: copy straight through without touching the tables.
        out(o) = b.toChar
        o += 1
      }
    }
    // Anything but ACCEPT here means either the REJECT state was entered or the
    // input stopped part-way through a sequence.
    if (state != UTF8_ACCEPT)
      throw new IllegalArgumentException(
        s"Malformed or truncated UTF-8 input in byte range [$offset, $end) (final decoder state $state)")
    o
  }

  /**
   * Decodes `length` bytes of `in`, starting at `offset`, into a new `String`.
   *
   * @throws IllegalArgumentException if the bytes are not well-formed UTF-8
   */
  def decode(in: Array[Byte], offset: Int, length: Int): String = {
    // The decoder emits at most one char per input byte (a 4-byte sequence yields
    // two chars), so `length` chars always suffice. Sizing from the decoded range
    // instead of the whole buffer avoids over-allocating when decoding a slice.
    val ret = new Array[Char](math.max(length, 0))
    val size = decode(in, offset, length, ret)
    new String(ret, 0, size)
  }

  /** Decodes the whole of `in` into a new `String`. */
  def decode(in: Array[Byte]): String = decode(in, 0, in.length)
}
/** Measures UTF-8 decoding through the hand-written `UTF8DFA.decode`. */
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@State(Scope.Benchmark)
class UTF8DFAStringBenchmark {
  /** Number of characters in the generated input string. */
  @Param(Array("10", "100", "1000", "10000", "100000"))
  var size: Int = _

  /** Selects `Random.nextAsciiString` (true) or `Random.nextString` (false) as the input source. */
  @Param(Array("true", "false"))
  var ascii: Boolean = _

  /** Source string; populated by [[setup]]. */
  var string: String = _

  /** UTF-8 encoding of [[string]]; this is what the benchmark decodes. */
  var utf8Bytes: Array[Byte] = _

  /**
   * Builds the benchmark input.
   *
   * JMH injects `@Param` fields only after the state object has been constructed,
   * so the input cannot be a constructor-time `val`: it would be computed with
   * `size = 0` and `ascii = false` and the benchmark would decode an empty string.
   */
  @Setup(Level.Trial)
  def setup(): Unit = {
    string = if (ascii) Random.nextAsciiString(size) else Random.nextString(size)
    utf8Bytes = string.getBytes("UTF-8")
  }

  /** Decodes the prepared bytes; the result is returned so JMH consumes it. */
  @Benchmark
  def decode(): String =
    UTF8DFA.decode(utf8Bytes)
}
/**
 * Measures UTF-16 decoding through the JDK constructor
 * `new String(bytes, offset, length, charsetName)`.
 */
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@State(Scope.Benchmark)
class UTF16StringBenchmark {
  /** Number of characters in the generated input string. */
  @Param(Array("10", "100", "1000", "10000", "100000"))
  var size: Int = _

  /** Selects `Random.nextAsciiString` (true) or `Random.nextString` (false) as the input source. */
  @Param(Array("true", "false"))
  var ascii: Boolean = _

  /** Source string; populated by [[setup]]. */
  var string: String = _

  /** UTF-16 encoding of [[string]]; this is what the benchmark decodes. */
  var utf16Bytes: Array[Byte] = _

  /**
   * Builds the benchmark input.
   *
   * JMH injects `@Param` fields only after the state object has been constructed,
   * so the input cannot be a constructor-time `val`: it would be computed with
   * `size = 0` and `ascii = false` and the benchmark would decode an empty string.
   */
  @Setup(Level.Trial)
  def setup(): Unit = {
    string = if (ascii) Random.nextAsciiString(size) else Random.nextString(size)
    utf16Bytes = string.getBytes("UTF-16")
  }

  /** Decodes the prepared bytes; the result is returned so JMH consumes it. */
  @Benchmark
  def decode(): String =
    new String(utf16Bytes, 0, utf16Bytes.length, "UTF-16")
}
/**
 * Measures ASCII decoding through the JDK constructor
 * `new String(bytes, offset, length, charsetName)`.
 */
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@State(Scope.Benchmark)
class ASCIIStringBenchmark {
  /** Number of characters in the generated input string. */
  @Param(Array("10", "100", "1000", "10000", "100000"))
  var size: Int = _

  /** Selects `Random.nextAsciiString` (true) or `Random.nextString` (false) as the input source. */
  @Param(Array("true", "false"))
  var ascii: Boolean = _

  /** Source string; populated by [[setup]]. */
  var string: String = _

  /**
   * ASCII encoding of [[string]]; this is what the benchmark decodes.
   * NOTE(review): with `ascii = false` the source string is not limited to ASCII,
   * so this encoding is presumably lossy for that parameter value - confirm intended.
   */
  var asciiBytes: Array[Byte] = _

  /**
   * Builds the benchmark input.
   *
   * JMH injects `@Param` fields only after the state object has been constructed,
   * so the input cannot be a constructor-time `val`: it would be computed with
   * `size = 0` and `ascii = false` and the benchmark would decode an empty string.
   */
  @Setup(Level.Trial)
  def setup(): Unit = {
    string = if (ascii) Random.nextAsciiString(size) else Random.nextString(size)
    asciiBytes = string.getBytes("ASCII")
  }

  /** Decodes the prepared bytes; the result is returned so JMH consumes it. */
  @Benchmark
  def decode(): String =
    new String(asciiBytes, 0, asciiBytes.length, "ASCII")
}
/**
 * Measures ASCII decoding done by hand: each byte is widened to a char in a
 * `while` loop and the resulting array is wrapped in a `String`.
 */
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@State(Scope.Benchmark)
class ASCIIDirectStringBenchmark {
  /** Number of characters in the generated input string. */
  @Param(Array("10", "100", "1000", "10000", "100000"))
  var size: Int = _

  /** Selects `Random.nextAsciiString` (true) or `Random.nextString` (false) as the input source. */
  @Param(Array("true", "false"))
  var ascii: Boolean = _

  /** Source string; populated by [[setup]]. */
  var string: String = _

  /** ASCII encoding of [[string]]; this is what the benchmark decodes. */
  var asciiBytes: Array[Byte] = _

  /**
   * Builds the benchmark input.
   *
   * JMH injects `@Param` fields only after the state object has been constructed,
   * so the input cannot be a constructor-time `val`: it would be computed with
   * `size = 0` and `ascii = false` and the benchmark would decode an empty string.
   */
  @Setup(Level.Trial)
  def setup(): Unit = {
    string = if (ascii) Random.nextAsciiString(size) else Random.nextString(size)
    asciiBytes = string.getBytes("ASCII")
  }

  /** Decodes the prepared bytes one at a time; the result is returned so JMH consumes it. */
  @Benchmark
  def decode(): String = {
    val buffer = new Array[Char](asciiBytes.length)
    var i = 0
    // Manual index loop on purpose: this is the hot path being measured.
    while (i < buffer.length) {
      buffer(i) = asciiBytes(i).toChar
      i += 1
    }
    new String(buffer)
  }
}
/**
 * Measures UTF-16 decoding done by hand: consecutive byte pairs are combined
 * big-endian into chars in a `while` loop and the array is wrapped in a `String`.
 */
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@State(Scope.Benchmark)
class UTF16StringDirectBenchmark {
  /** Number of characters in the generated input string. */
  @Param(Array("10", "100", "1000", "10000", "100000"))
  var size: Int = _

  /** Selects `Random.nextAsciiString` (true) or `Random.nextString` (false) as the input source. */
  @Param(Array("true", "false"))
  var ascii: Boolean = _

  /** Source string; populated by [[setup]]. */
  var string: String = _

  /** UTF-16 encoding of [[string]]; this is what the benchmark decodes. */
  var utf16Bytes: Array[Byte] = _

  /**
   * Builds the benchmark input.
   *
   * JMH injects `@Param` fields only after the state object has been constructed,
   * so the input cannot be a constructor-time `val`: it would be computed with
   * `size = 0` and `ascii = false` and the benchmark would decode an empty string.
   */
  @Setup(Level.Trial)
  def setup(): Unit = {
    string = if (ascii) Random.nextAsciiString(size) else Random.nextString(size)
    utf16Bytes = string.getBytes("UTF-16")
  }

  /** Decodes the prepared bytes pair by pair; the result is returned so JMH consumes it. */
  @Benchmark
  def decode(): String = {
    // The "UTF-16" charset encodes big-endian and prepends a byte-order mark
    // (0xFE 0xFF). Skip it so the result equals the source string instead of
    // starting with a stray U+FEFF character.
    val start =
      if (utf16Bytes.length >= 2 &&
          (utf16Bytes(0) & 0x00FF) == 0xFE &&
          (utf16Bytes(1) & 0x00FF) == 0xFF) 2
      else 0
    val buffer = new Array[Char]((utf16Bytes.length - start) >> 1)
    var i = 0
    // Manual index loop on purpose: this is the hot path being measured.
    while (i < buffer.length) {
      val bpos = start + (i << 1)
      // High byte first (big-endian); mask to undo sign extension of Byte.
      val c = (((utf16Bytes(bpos) & 0x00FF) << 8) + (utf16Bytes(bpos + 1) & 0x00FF)).toChar
      buffer(i) = c
      i += 1
    }
    new String(buffer)
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment