Skip to content

Instantly share code, notes, and snippets.

@Ichoran
Created October 16, 2024 06:29
Show Gist options
  • Save Ichoran/bcb3fc58ce27a11e21c59df958ccb085 to your computer and use it in GitHub Desktop.
Save Ichoran/bcb3fc58ce27a11e21c59df958ccb085 to your computer and use it in GitHub Desktop.
Benchmark to test whether ThreadLocal creation (via DynamicVariable) really impacts Loom threads
//> using scala 3.5.0
//> using dep com.github.ichoran::kse3-basics:0.3.11
//> using dep com.github.ichoran::kse3-flow:0.3.11
// Run with scala-cli --power --jmh --jvm=21 ThreadLocalLoom.scala
// If you change the classes being benchmarked, you may have to rm -r .scala-build
package threadlocalloom.bench
import org.openjdk.jmh.annotations._
import org.openjdk.jmh.infra.Blackhole
import java.util.concurrent.TimeUnit
import kse.basics.*
import kse.basics.intervals.*
import kse.flow.*
/** One unit of benchmark work: hashes a fixed string and accumulates a derived value.
  *
  * @param input the text whose MurmurHash3 string hash is fed to every `compute` call
  */
class Compute(input: String):
  /** Running total of the values produced by `compute`; starts at zero. */
  @volatile var output: Long = 0L

  /** Hashes `input`, clears the sign bit of the hash, and adds `modify(hash)` to `output`.
    *
    * @param modify transformation applied to the non-negative hash before it is accumulated
    */
  def compute(modify: Int => Int): Unit =
    import scala.util.hashing.MurmurHash3
    // Mask off the sign bit so `modify` always receives a non-negative Int.
    val nonNegativeHash = MurmurHash3.stringHash(input) & 0x7FFFFFFF
    output += modify(nonNegativeHash)
/** Baseline runner that never calls `compute`; it only totals the current outputs.
  *
  * @param computes the work items whose `output` fields are added together
  */
class Computeless(computes: Array[Compute]):
  /** Accumulates `output` over every element of `computes`, seeded with `0L`.
    *
    * @return the accumulated total
    */
  def run(): Long =
    computes.gather(0L)()((total, item, _) => total + item.output)
/** Runner that performs every computation on the calling thread, with no futures involved.
  *
  * @param computes the work items to update and then total
  */
class ComputeSingleThreaded(computes: Array[Compute]):
  /** Calls `compute` on each item in turn, then totals the resulting outputs.
    *
    * @return the accumulated total of `output` over `computes`, seeded with `0L`
    */
  def run(): Long =
    // Each hash is rounded down to a multiple of 4 before being accumulated.
    computes.peek()(item => item.compute(hash => hash - (hash % 4)))
    computes.gather(0L)()((total, item, _) => total + item.output)
/** Runner that wraps each computation in its own `Fu`, using a fixed modulus of 4.
  *
  * @param computes the work items to update and then total
  */
class ComputeStatic(computes: Array[Compute]):
  /** Launches one inner `Fu` per item from inside an outer `Fu`, waits, then totals the outputs.
    *
    * @return the accumulated total of `output` over `computes`, seeded with `0L`
    */
  def run(): Long =
    val coordinator = Fu:
      val workers = computes.copyWith: item =>
        Fu:
          item.compute(hash => hash - (hash % 4))
      // Apply `.?` to every inner Fu from within the outer one.
      workers.peek()(worker => worker.?)
    // The result of the outer Fu is requested before any output is read.
    coordinator.ask().get
    computes.gather(0L)()((total, item, _) => total + item.output)
/** Same workload as the fixed-modulus `Fu` runner, but the instance also owns ten
  * `DynamicVariable`s.
  *
  * The variables are created when the instance is constructed and are never read or
  * written by `run`.
  *
  * @param computes the work items to update and then total
  */
class ComputeStaticWithLocals(computes: Array[Compute]):
  // Ten dynamic variables that exist only to be allocated; `run` does not touch them.
  val dv0 = scala.util.DynamicVariable("eel")
  val dv1 = scala.util.DynamicVariable("cod")
  val dv2 = scala.util.DynamicVariable("herring")
  val dv3 = scala.util.DynamicVariable("perch")
  val dv4 = scala.util.DynamicVariable("bass")
  val dv5 = scala.util.DynamicVariable("salmon")
  val dv6 = scala.util.DynamicVariable("sole")
  val dv7 = scala.util.DynamicVariable("minnow")
  val dv8 = scala.util.DynamicVariable("tuna")
  val dv9 = scala.util.DynamicVariable("sturgeon")

  /** Launches one inner `Fu` per item from inside an outer `Fu`, waits, then totals the outputs.
    *
    * @return the accumulated total of `output` over `computes`, seeded with `0L`
    */
  def run(): Long =
    val coordinator = Fu:
      val workers = computes.copyWith: item =>
        Fu:
          item.compute(hash => hash - (hash % 4))
      workers.peek()(worker => worker.?)
    coordinator.ask().get
    computes.gather(0L)()((total, item, _) => total + item.output)
/** Runner whose modulus comes from a shared `Atom` that every computation also updates.
  *
  * @param computes the work items to update and then total
  */
class ComputeAtomic(computes: Array[Compute]):
  /** Shared modulus source, initialised to 4 and advanced by 2 on each `swapOp` call. */
  val a = Atom(4)

  /** Launches one inner `Fu` per item; each takes its modulus from `a.swapOp`, then totals.
    *
    * @return the accumulated total of `output` over `computes`, seeded with `0L`
    */
  def run(): Long =
    val coordinator = Fu:
      val workers = computes.copyWith: item =>
        Fu:
          // NOTE(review): presumably `swapOp` yields the value held before the update - confirm in kse docs.
          item.compute(hash => hash - (hash % a.swapOp(prev => prev + 2)))
      workers.peek()(worker => worker.?)
    coordinator.ask().get
    // The accumulator is named `total` so it does not shadow the member `a`.
    computes.gather(0L)()((total, item, _) => total + item.output)
/** Runner whose modulus is read from a `DynamicVariable` inside each inner `Fu`.
  *
  * @param computes the work items to update and then total
  */
class ComputeThreadLocal(computes: Array[Compute]):
  /** Dynamic variable with default value 1; `run` rebinds it to 15 around the inner futures. */
  val dv = scala.util.DynamicVariable(1)

  /** Binds `dv` to 15 inside the outer `Fu`, launches one inner `Fu` per item, then totals.
    *
    * @return the accumulated total of `output` over `computes`, seeded with `0L`
    */
  def run(): Long =
    val coordinator = Fu:
      dv.withValue(15):
        val workers = computes.copyWith: item =>
          Fu:
            // NOTE(review): which value `dv.value` yields here depends on how Fu runs its body - confirm.
            item.compute(hash => hash - (hash % dv.value))
        workers.peek()(worker => worker.?)
    coordinator.ask().get
    computes.gather(0L)()((total, item, _) => total + item.output)
/** Runner in which each inner `Fu` both assigns and reads a `DynamicVariable`.
  *
  * @param computes the work items to update and then total
  */
class ComputeThreadLocalMod(computes: Array[Compute]):
  /** Dynamic variable with default value 1; rebound to 15 in `run`, then set to 17 per item. */
  val dv = scala.util.DynamicVariable(1)

  /** Binds `dv` to 15 inside the outer `Fu`; each inner `Fu` sets it to 17 and uses it as modulus.
    *
    * @return the accumulated total of `output` over `computes`, seeded with `0L`
    */
  def run(): Long =
    val coordinator = Fu:
      dv.withValue(15):
        val workers = computes.copyWith: item =>
          Fu:
            item.compute: hash =>
              // Write first, then read back: the modulus used is whatever `dv` holds after the write.
              dv.value = 17
              hash - (hash % dv.value)
        workers.peek()(worker => worker.?)
    coordinator.ask().get
    computes.gather(0L)()((total, item, _) => total + item.output)
/** JMH benchmark suite with one benchmark method per runner class.
  *
  * Configured to report average time in microseconds, with 10 warmup iterations of 1000 ms,
  * 20 measurement iterations of 5000 ms, 2 forks, and a single benchmark thread.
  * Every benchmark builds a fresh array of `Compute` instances from `pieces`, runs it once,
  * and hands the result to the `Blackhole`.
  */
@State(Scope.Benchmark)
@BenchmarkMode(Array(Mode.AverageTime))
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@Warmup(iterations = 10, time = 1000, timeUnit = TimeUnit.MILLISECONDS)
@Measurement(iterations = 20, time = 5000, timeUnit = TimeUnit.MILLISECONDS)
@Fork(2)
@Threads(1)
class ThreadLocalLoom():
  /** Text from which all benchmark inputs are cut. */
  val source = """
Once upon a time there was a very unusual proposal for the Java Virtual Machine.
Even though everyone had agreed after the initial implementation of green threads
that OS threads were superior to green threads, and Java should no longer support
them, a small and intrepid team believed that, in fact, Java could support green
threads as long as they didn't mention the color green. But one huge roadblock
stood in their way: ThreadLocal variables. This is the story of the success or
failure of the architects of Project Loom to tame the performance impact of
ThreadLocal variables. This is also an unreasonably long string because the core
part of the story is actually going to involve computing hash codes on subsets of
this string, and therefore if the story is insufficiently long, there won't be
enough compute to matter. I mean, I could just duplicate it a bunch of times.
Actually, why don't I stop writing and just do that now? Sorry about all this.
"""

  /** Input strings shared by every benchmark.
    *
    * NOTE(review): this looks like one suffix of `source` per start index in the first half
    * of the string - confirm against the kse `Iv` / `select` documentation.
    */
  val pieces = Iv.of(source).iNOp(_/2).where().copyWith: start =>
    source.select(start to End)

  /** Sums outputs without computing anything. */
  @Benchmark
  def nothing(bh: Blackhole): Unit =
    val runner = Computeless(pieces.copyWith(piece => Compute(piece)))
    bh.consume(runner.run())

  /** One `Fu` per item with a constant modulus. */
  @Benchmark
  def base(bh: Blackhole): Unit =
    val runner = ComputeStatic(pieces.copyWith(piece => Compute(piece)))
    bh.consume(runner.run())

  /** Same as `base`, but the runner also allocates ten unused `DynamicVariable`s. */
  @Benchmark
  def local10(bh: Blackhole): Unit =
    val runner = ComputeStaticWithLocals(pieces.copyWith(piece => Compute(piece)))
    bh.consume(runner.run())

  /** One `Fu` per item, with the modulus taken from a shared `Atom`. */
  @Benchmark
  def atom(bh: Blackhole): Unit =
    val runner = ComputeAtomic(pieces.copyWith(piece => Compute(piece)))
    bh.consume(runner.run())

  /** One `Fu` per item, with the modulus read from a `DynamicVariable`. */
  @Benchmark
  def localuse(bh: Blackhole): Unit =
    val runner = ComputeThreadLocal(pieces.copyWith(piece => Compute(piece)))
    bh.consume(runner.run())

  /** One `Fu` per item, with the `DynamicVariable` assigned and then read inside each one. */
  @Benchmark
  def localmod(bh: Blackhole): Unit =
    val runner = ComputeThreadLocalMod(pieces.copyWith(piece => Compute(piece)))
    bh.consume(runner.run())

  /** All items computed sequentially on the benchmark thread. */
  @Benchmark
  def single(bh: Blackhole): Unit =
    val runner = ComputeSingleThreaded(pieces.copyWith(piece => Compute(piece)))
    bh.consume(runner.run())
end ThreadLocalLoom
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment