Created
July 6, 2015 17:17
-
-
Save andresilva/c19c1ab03d46fc9bedc4 to your computer and use it in GitHub Desktop.
JVM string decoding performance
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package benchmark | |
import java.nio._ | |
import java.util.concurrent.TimeUnit | |
import org.openjdk.jmh.annotations._ | |
/** Deterministic input generators shared by the decoding benchmarks. */
object Random {

  /**
   * Returns a brand-new generator seeded with 0 on every access, so every
   * string produced through this object is reproducible across runs.
   */
  def random: scala.util.Random = new scala.util.Random(0)

  /**
   * Generates a pseudo-random string via `scala.util.Random.nextString`.
   *
   * @param size number of chars to generate
   */
  def nextString(size: Int): String = random.nextString(size)

  /**
   * Generates a string made only of printable ASCII characters.
   *
   * @param size number of chars to generate (previously ignored in favour of
   *             a hard-coded 100000)
   */
  def nextAsciiString(size: Int): String = {
    // Grab one generator up front: `random` would otherwise hand back a fresh
    // seed-0 instance per char and the string would be a single repeated char.
    val r = random
    Array.fill[Char](size)(r.nextPrintableChar()).mkString
  }
}
/** Benchmarks the JDK's built-in UTF-8 decoding through the `String` constructor. */
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@State(Scope.Benchmark)
class UTF8StringBenchmark {
  /** Number of chars in the generated input string. */
  @Param(Array("10", "100", "1000", "10000", "100000"))
  var size: Int = _

  /** When true the input contains only printable ASCII characters. */
  @Param(Array("true", "false"))
  var ascii: Boolean = _

  /** Source string; populated by [[setup]]. */
  var string: String = _

  /** UTF-8 encoding of [[string]]; populated by [[setup]]. */
  var utf8Bytes: Array[Byte] = _

  /**
   * Builds the fixtures once the parameters are known.
   *
   * JMH injects `@Param` values only after the state object has been
   * constructed, so computing these in field initializers would always see
   * `size == 0` and `ascii == false` and benchmark an empty string.
   */
  @Setup
  def setup(): Unit = {
    string = if (ascii) Random.nextAsciiString(size) else Random.nextString(size)
    utf8Bytes = string.getBytes("UTF-8")
  }

  /** Decodes the prepared bytes back into a `String`. */
  @Benchmark
  def decode(): String =
    new String(utf8Bytes, 0, utf8Bytes.length, "UTF-8")
}
/**
 * Table-driven UTF-8 decoder: a deterministic finite automaton whose state is
 * advanced one input byte at a time, emitting UTF-16 chars whenever a complete
 * code point has been accepted.
 */
object UTF8DFA {
  /** Automaton state meaning "a complete, valid code point has just been read". */
  final val UTF8_ACCEPT = 0

  /** Automaton state meaning "the input is not valid UTF-8"; every transition out of it leads back to it. */
  final val UTF8_REJECT = 12

  final val utf8d: Array[Byte] = Array(
    // The first part of the table (indices 0-255) maps bytes to character
    // classes, to reduce the size of the transition table and create bitmasks.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    // The second part (indices 256 and up) is a transition table that maps a
    // combination of a state of the automaton and a character class to a state.
    0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
    12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12,
    12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
    12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
    12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12)

  /**
   * Decodes `length` bytes of UTF-8 starting at `in(offset)` into `out`,
   * writing from index 0.
   *
   * Every input byte yields at most one output char (a 4-byte sequence yields
   * a surrogate pair), so an `out` of at least `length` chars is always large
   * enough.
   *
   * @param in     buffer holding the encoded bytes
   * @param offset index of the first byte to decode
   * @param length number of bytes to decode
   * @param out    destination for the decoded UTF-16 chars
   * @return the number of chars written to `out`
   * @throws IllegalArgumentException if the automaton is not in the accept
   *         state after the last byte (malformed or truncated input)
   */
  def decode(in: Array[Byte], offset: Int, length: Int, out: Array[Char]): Int = {
    val end = offset + length
    var i = offset
    var o = 0
    var state = UTF8_ACCEPT
    var codepoint = 0
    while (i < end) {
      val b = in(i) & 0xff
      i += 1
      if (state != UTF8_ACCEPT) {
        // Inside a multi-byte sequence (or already rejected): fold in the low
        // six payload bits and advance the automaton.
        val ttype = utf8d(b)
        codepoint = (codepoint << 6) | (b & 63)
        state = utf8d(256 + state + ttype)
        if (state == UTF8_ACCEPT) {
          if (codepoint > 0xffff) {
            // Supplementary code point: emit a surrogate pair.
            // 0xd7c0 is 0xd800 - (0x10000 >> 10), folding the offset subtraction in.
            out(o) = (0xd7c0 + (codepoint >> 10)).toChar
            o += 1
            out(o) = (0xdc00 + (codepoint & 0x3ff)).toChar
            o += 1
          } else {
            out(o) = codepoint.toChar
            o += 1
          }
        }
      } else if (b > 0x7f) {
        // Lead byte of a multi-byte sequence: the character class doubles as
        // the shift that masks off the length-marker bits.
        val ttype = utf8d(b)
        codepoint = b & (255 >> ttype)
        state = utf8d(256 + ttype)
      } else {
        // Plain ASCII fast path.
        out(o) = b.toChar
        o += 1
      }
    }
    if (state != UTF8_ACCEPT)
      throw new IllegalArgumentException(
        "Malformed or truncated UTF-8 input (final decoder state " + state + ")")
    o
  }

  /**
   * Decodes `length` bytes of UTF-8 starting at `in(offset)` into a `String`.
   *
   * @throws IllegalArgumentException if the input is malformed or truncated
   */
  def decode(in: Array[Byte], offset: Int, length: Int): String = {
    // Size the scratch buffer from the slice being decoded, not from the whole
    // backing array; the char count can never exceed the byte count.
    val ret = new Array[Char](math.max(length, 0))
    val size = decode(in, offset, length, ret)
    new String(ret, 0, size)
  }

  /** Decodes the whole of `in` as UTF-8. */
  def decode(in: Array[Byte]): String = decode(in, 0, in.length)
}
/** Benchmarks UTF-8 decoding through the hand-written `UTF8DFA` decoder. */
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@State(Scope.Benchmark)
class UTF8DFAStringBenchmark {
  /** Number of chars in the generated input string. */
  @Param(Array("10", "100", "1000", "10000", "100000"))
  var size: Int = _

  /** When true the input contains only printable ASCII characters. */
  @Param(Array("true", "false"))
  var ascii: Boolean = _

  /** Source string; populated by [[setup]]. */
  var string: String = _

  /** UTF-8 encoding of [[string]]; populated by [[setup]]. */
  var utf8Bytes: Array[Byte] = _

  /**
   * Builds the fixtures once the parameters are known.
   *
   * JMH injects `@Param` values only after the state object has been
   * constructed, so computing these in field initializers would always see
   * `size == 0` and `ascii == false` and benchmark an empty string.
   */
  @Setup
  def setup(): Unit = {
    string = if (ascii) Random.nextAsciiString(size) else Random.nextString(size)
    utf8Bytes = string.getBytes("UTF-8")
  }

  /** Decodes the prepared bytes back into a `String`. */
  @Benchmark
  def decode(): String =
    UTF8DFA.decode(utf8Bytes)
}
/** Benchmarks the JDK's built-in UTF-16 decoding through the `String` constructor. */
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@State(Scope.Benchmark)
class UTF16StringBenchmark {
  /** Number of chars in the generated input string. */
  @Param(Array("10", "100", "1000", "10000", "100000"))
  var size: Int = _

  /** When true the input contains only printable ASCII characters. */
  @Param(Array("true", "false"))
  var ascii: Boolean = _

  /** Source string; populated by [[setup]]. */
  var string: String = _

  /** UTF-16 encoding of [[string]]; populated by [[setup]]. */
  var utf16Bytes: Array[Byte] = _

  /**
   * Builds the fixtures once the parameters are known.
   *
   * JMH injects `@Param` values only after the state object has been
   * constructed, so computing these in field initializers would always see
   * `size == 0` and `ascii == false` and benchmark an (almost) empty input.
   */
  @Setup
  def setup(): Unit = {
    string = if (ascii) Random.nextAsciiString(size) else Random.nextString(size)
    utf16Bytes = string.getBytes("UTF-16")
  }

  /** Decodes the prepared bytes back into a `String`. */
  @Benchmark
  def decode(): String =
    new String(utf16Bytes, 0, utf16Bytes.length, "UTF-16")
}
/** Benchmarks the JDK's built-in ASCII decoding through the `String` constructor. */
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@State(Scope.Benchmark)
class ASCIIStringBenchmark {
  /** Number of chars in the generated input string. */
  @Param(Array("10", "100", "1000", "10000", "100000"))
  var size: Int = _

  /** When true the input contains only printable ASCII characters. */
  @Param(Array("true", "false"))
  var ascii: Boolean = _

  /** Source string; populated by [[setup]]. */
  var string: String = _

  /**
   * ASCII encoding of [[string]]; populated by [[setup]].
   * NOTE(review): when `ascii` is false the source string is not
   * representable in ASCII, so this encoding is presumably lossy — confirm
   * that is acceptable for the comparison.
   */
  var asciiBytes: Array[Byte] = _

  /**
   * Builds the fixtures once the parameters are known.
   *
   * JMH injects `@Param` values only after the state object has been
   * constructed, so computing these in field initializers would always see
   * `size == 0` and `ascii == false` and benchmark an empty string.
   */
  @Setup
  def setup(): Unit = {
    string = if (ascii) Random.nextAsciiString(size) else Random.nextString(size)
    asciiBytes = string.getBytes("ASCII")
  }

  /** Decodes the prepared bytes back into a `String`. */
  @Benchmark
  def decode(): String =
    new String(asciiBytes, 0, asciiBytes.length, "ASCII")
}
/** Benchmarks ASCII decoding done by hand: one byte widened to one char. */
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@State(Scope.Benchmark)
class ASCIIDirectStringBenchmark {
  /** Number of chars in the generated input string. */
  @Param(Array("10", "100", "1000", "10000", "100000"))
  var size: Int = _

  /** When true the input contains only printable ASCII characters. */
  @Param(Array("true", "false"))
  var ascii: Boolean = _

  /** Source string; populated by [[setup]]. */
  var string: String = _

  /** ASCII encoding of [[string]]; populated by [[setup]]. */
  var asciiBytes: Array[Byte] = _

  /**
   * Builds the fixtures once the parameters are known.
   *
   * JMH injects `@Param` values only after the state object has been
   * constructed, so computing these in field initializers would always see
   * `size == 0` and `ascii == false` and benchmark an empty string.
   */
  @Setup
  def setup(): Unit = {
    string = if (ascii) Random.nextAsciiString(size) else Random.nextString(size)
    asciiBytes = string.getBytes("ASCII")
  }

  /** Copies every byte into a char buffer and wraps the buffer in a `String`. */
  @Benchmark
  def decode(): String = {
    val buffer = new Array[Char](asciiBytes.length)
    var i = 0
    // Deliberate while-loop over primitives: this is the measured hot path.
    while (i < buffer.length) {
      buffer(i) = asciiBytes(i).toChar
      i += 1
    }
    new String(buffer)
  }
}
/** Benchmarks UTF-16 decoding done by hand: two bytes combined into one char. */
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@State(Scope.Benchmark)
class UTF16StringDirectBenchmark {
  /** Number of chars in the generated input string. */
  @Param(Array("10", "100", "1000", "10000", "100000"))
  var size: Int = _

  /** When true the input contains only printable ASCII characters. */
  @Param(Array("true", "false"))
  var ascii: Boolean = _

  /** Source string; populated by [[setup]]. */
  var string: String = _

  /** UTF-16 encoding of [[string]]; populated by [[setup]]. */
  var utf16Bytes: Array[Byte] = _

  /**
   * Builds the fixtures once the parameters are known.
   *
   * JMH injects `@Param` values only after the state object has been
   * constructed, so computing these in field initializers would always see
   * `size == 0` and `ascii == false` and benchmark an (almost) empty input.
   */
  @Setup
  def setup(): Unit = {
    string = if (ascii) Random.nextAsciiString(size) else Random.nextString(size)
    utf16Bytes = string.getBytes("UTF-16")
  }

  /**
   * Combines each consecutive byte pair, high byte first, into one char.
   *
   * NOTE(review): the loop does no byte-order-mark handling, so if the
   * "UTF-16" encoder prepends a BOM (the JDK charset documentation says it
   * writes a big-endian one) the result starts with an extra U+FEFF char that
   * the `String`-constructor benchmark would not produce — confirm whether
   * that asymmetry matters for the comparison.
   */
  @Benchmark
  def decode(): String = {
    val buffer = new Array[Char](utf16Bytes.length >> 1)
    var i = 0
    // Deliberate while-loop over primitives: this is the measured hot path.
    while (i < buffer.length) {
      val bpos = i << 1
      val c = (((utf16Bytes(bpos) & 0x00FF) << 8) + (utf16Bytes(bpos + 1) & 0x00FF)).toChar
      buffer(i) = c
      i += 1
    }
    new String(buffer)
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment