Skip to content

Instantly share code, notes, and snippets.

Created November 2, 2023 16:42
Show Gist options
  • Save hugosenari/8f1ccc6116ba434fb042b984d73a5656 to your computer and use it in GitHub Desktop.
Save hugosenari/8f1ccc6116ba434fb042b984d73a5656 to your computer and use it in GitHub Desktop.
CopyMem Size Bench
import std/[macros, monotimes, strformat, strutils, times]
# source
Experimental API, subject to change.
Future work:
* convert ticks to time; see some approaches here:
* provide feature detection to test whether the CPU supports it (on linux, via /proc/cpuinfo)
* test on ARMv8-A, ARMv8-M, arm64
## js
* we use `window.performance.now()`
## nodejs
* we use `process.hrtime.bigint()`
## ARM
* The ARMv8-A architecture[1] manual explicitly states that two reads to the PMCCNTR_EL0 register may return the same value[1a].
There is also the CNTVCT_EL0[1b] register, however it's unclear whether that register is even monotonic (it's implied, but not stated explicitly).
The ARMv8-M architecture[2] has the CYCCNT register, however all that's mentioned is that it is an "optional free-running 32-bit cycle counter"[2a].
## references
[1a] See [1], PDF page 2852
[2a] See [2], PDF page 367
## further links
when defined(js):
  proc getCpuTicksImpl(): int64 =
    ## Returns ticks in nanoseconds.
    # xxx consider returning JsBigInt instead of float
    when defined(nodejs):
      # Node.js: `process.hrtime.bigint()` is narrowed to a JS number.
      {.emit: """
      let process = require('process');
      `result` = Number(process.hrtime.bigint());
      """.}
    else:
      # Browser: `performance.now()` reports milliseconds, so scale the
      # reading up to nanoseconds.
      proc jsNow(): int64 {.importjs: "window.performance.now()".}
      result = jsNow() * 1_000_000
else:
  # Native targets: bind the compiler's `__rdtsc` intrinsic. The header
  # that declares it differs between POSIX and other toolchains.
  const header =
    when defined(posix): "<x86intrin.h>"
    else: "<intrin.h>"
  proc getCpuTicksImpl(): uint64 {.importc: "__rdtsc", header: header.}
template getCpuTicks*(): int64 =
  ## Returns number of CPU ticks as given by a platform specific timestamp
  ## counter, oftentimes the `RDTSC` instruction.
  ## Unlike `std/monotimes.ticks`, this gives a strictly monotonic counter
  ## at least on recent enough x86 platforms, and has higher resolution and
  ## lower overhead, allowing to measure individual instructions
  ## (corresponding to time offsets in the nanosecond range). A best effort
  ## implementation is provided when a timestamp counter is not available.
  ##
  ## Note that the CPU may reorder instructions.
  runnableExamples:
    for i in 0..<100:
      let t1 = getCpuTicks()
      # code to benchmark can go here
      let t2 = getCpuTicks()
      assert t2 > t1
  # The backend implementation returns `uint64` on native targets and
  # `int64` on JS; the cast gives callers a single `int64` result type.
  cast[int64](getCpuTicksImpl())
template toInt64(lo, hi): untyped =
  ## Joins the low (`lo`) and high (`hi`) 32-bit halves of a counter reading
  ## into a single `int64`. Both halves are taken from the parameters; the
  ## template no longer relies on a variable named `d` at the call site.
  cast[int64](cast[uint64](lo) or (cast[uint64](hi) shl 32))

proc getCpuTicksStart*(): int64 {.inline.} =
  ## Variant of `getCpuTicks` which uses the `RDTSCP` instruction. Compared
  ## to `getCpuTicks`, this avoids introducing noise in the measurements
  ## caused by CPU instruction reordering, and can result in more
  ## deterministic results, at the expense of extra overhead and requiring
  ## asymmetric start/stop APIs.
  ##
  ## A best effort implementation is provided for platforms where `RDTSCP`
  ## is not available.
  runnableExamples:
    for i in 0..<100:
      let t1 = getCpuTicksStart()
      # code to benchmark can go here
      let t2 = getCpuTicksEnd()
      assert t2 > t1, $(t1, t2)
  when nimvm: result = getCpuTicks()
  else:
    when defined(js): result = getCpuTicks()
    else:
      var a {.noinit.}: cuint # low 32 bits of the counter (EAX)
      var d {.noinit.}: cuint # high 32 bits of the counter (EDX)
      # `cpuid` is issued before the read so that the code preceding the
      # measured region cannot be reordered past the counter read.
      # Backticks let Nim substitute the generated C names of `a` and `d`.
      {.emit: """
      asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
      asm volatile("rdtsc" : "=a" (`a`), "=d" (`d`));
      """.}
      result = toInt64(a, d)

proc getCpuTicksEnd*(): int64 {.inline.} =
  ## See `getCpuTicksStart <#getCpuTicksStart>`_
  when nimvm: result = getCpuTicks()
  else:
    when defined(js): result = getCpuTicks()
    else:
      var a {.noinit.}: cuint # low 32 bits of the counter (EAX)
      var d {.noinit.}: cuint # high 32 bits of the counter (EDX)
      # `rdtscp` also writes ECX, so that register is declared as clobbered.
      # The trailing `cpuid` keeps later code from being hoisted above the
      # counter read.
      {.emit: """
      asm volatile("rdtscp" : "=a" (`a`), "=d" (`d`) : : "%rcx");
      asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
      """.}
      result = toInt64(a, d)
macro unroll(x: static int, name, body: untyped) =
  ## Expands `body` once per generated size, each copy in its own `block`
  ## with `name` bound to that size as a `const`.
  ##
  ## Sizes start from 512, which is doubled before anything is emitted, and
  ## keep doubling while the base is below `x`. For every base `a` five
  ## sizes are produced: `a`, then `a` plus successive halves
  ## (`a + a/2`, `a + a/2 + a/4`, ... down to `a/16`).
  result = newStmtList()
  var a = 512
  while a < x:
    a = a * 2
    var size = a # current size to emit
    var step = a # halved each round to form the next increment
    for _ in 0 ..< 5:
      result.add newBlockStmt(newStmtList(
        newConstStmt(name, newLit size),
        copy body
      ))
      step = step div 2
      size += step
template timeInNS(body: untyped): int64 =
  ## Runs `body` once and yields its elapsed monotonic time in nanoseconds.
  ##
  ## Two back-to-back clock reads are taken first; their difference is the
  ## cost of the measurement itself and is subtracted from the result.
  ## The result is always at least 1, so it is safe to divide by.
  var
    start = getMonotime()
    endt = getMonotime()
  let noise = (endt - start).inNanoseconds
  start = getMonotime()
  body
  endt = getMonotime()
  abs((endt - start).inNanoseconds - noise) + 1
template ticks(body: untyped): int64 =
  ## Runs `body` once and yields the number of CPU ticks it took, measured
  ## with `getCpuTicksStart` / `getCpuTicksEnd`.
  ##
  ## An empty start/end pair is measured first and subtracted as overhead.
  ## The result is always at least 1, so it is safe to divide by.
  var
    start = getCpuTicksStart()
    endt = getCpuTicksEnd()
  let noise = endt - start
  start = getCpuTicksStart()
  body
  endt = getCpuTicksEnd()
  abs(endt - start - noise) + 1
proc cope(size: static int): void =
  ## Benchmarks `copyMem` for a buffer of `size` bytes and prints one report
  ## line with the cost in CPU ticks and in nanoseconds, plus the derived
  ## bytes-per-tick and bytes-per-nanosecond figures.
  let
    bufferA = create(uint8, size)
    bufferB = create(uint8, size)
  # `create` hands out manually managed memory; release it on every exit
  # path so repeated calls do not leak.
  defer:
    dealloc bufferA
    dealloc bufferB
  # One copy is measured in ticks (A -> B), the other in wall time (B -> A).
  let chicks = ticks:
    copyMem bufferB, bufferA, size
  let tns = timeInNS:
    copyMem bufferA, bufferB, size
  echo &"{size:>8} bytes, {chicks:>8} ticks, {(size div chicks):>4} B/tick, {tns:>8} ns, {(size div tns):>4} B/ns"
# Upper bound for the generated buffer sizes; `intdefine` allows it to be
# overridden at compile time through the `.maxSize` define.
const maxSize {.intdefine: ".maxSize".} = 262144

# Run the copy benchmark once for every size generated by `unroll`.
unroll(maxSize, size):
  cope(size)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment