Created
November 2, 2023 16:42
-
-
Save hugosenari/8f1ccc6116ba434fb042b984d73a5656 to your computer and use it in GitHub Desktop.
CopyMem Size Bench
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import std/[macros, monotimes, strformat, strutils, times] | |
# source | |
# https://github.com/timotheecour/Nim/blob/94a32119cb5eeeff2a825dc29cbbe60accb6432e/lib/std/cputicks.nim | |
##[ | |
Experimental API, subject to change. | |
]## | |
#[ | |
Future work: | |
* convert ticks to time; see some approaches here: https://quick-bench.com/q/WcbqUWBCoNBJvCP4n8h3kYfZDXU | |
* provide feature detection to test whether the CPU supports it (on linux, via /proc/cpuinfo) | |
* test on ARMv8-A, ARMv8-M, arm64 | |
## js | |
* we use `window.performance.now()` | |
## nodejs | |
* we use `process.hrtime.bigint()` | |
## ARM | |
* The ARMv8-A architecture[1] manual explicitly states that two reads to the PMCCNTR_EL0 register may return the same value[1a]. | |
There is also the CNTVCT_EL0[1b] register, however it's unclear whether that register is even monotonic (it's implied, but not stated explicitly). | |
The ARMv8-M architecture[2] has the CYCCNT register, however all that's mentioned is that it is an "optional free-running 32-bit cycle counter"[2a]. | |
## references | |
[1] https://documentation-service.arm.com/static/611fa684674a052ae36c7c91 | |
[1a] See [1], PDF page 2852 | |
[2] https://documentation-service.arm.com/static/60e6f8573d73a34b640e0cee | |
[2a] See [2]. PDF page 367 | |
## further links | |
* https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf | |
* https://gist.github.com/savanovich/f07eda9dba9300eb9ccf | |
* https://developers.redhat.com/blog/2016/03/11/practical-micro-benchmarking-with-ltrace-and-sched# | |
]# | |
when defined(js):
  proc getCpuTicksImpl(): int64 =
    ## Backend-specific tick source for the JavaScript targets; the value
    ## is expressed in nanoseconds.
    # xxx consider returning JsBigInt instead of float
    when defined(nodejs):
      # nodejs: read the high-resolution timer as a BigInt and narrow it
      # to a plain JS number.
      {.emit: """
      let process = require('process');
      `result` = Number(process.hrtime.bigint());
      """.}
    else:
      # Browser: scale the `performance.now()` reading by 1_000_000 so it
      # matches the nanosecond unit used by the nodejs branch.
      proc performanceNow(): int64 {.importjs: "window.performance.now()".}
      result = performanceNow() * 1_000_000
else:
  # Native targets: bind the compiler's `__rdtsc` intrinsic, declared in a
  # different header depending on whether the platform is POSIX.
  const intrinsicsHeader =
    when defined(posix): "<x86intrin.h>"
    else: "<intrin.h>"
  proc getCpuTicksImpl(): uint64 {.importc: "__rdtsc",
                                   header: intrinsicsHeader.}
template getCpuTicks*(): int64 =
  ## Reads the platform specific timestamp counter (oftentimes through the
  ## `RDTSC` instruction) and yields the number of CPU ticks as an `int64`.
  ##
  ## In contrast to `std/monotimes.ticks`, the counter is strictly monotonic
  ## at least on recent enough x86 platforms; it also has higher resolution
  ## and lower overhead, which makes it possible to measure individual
  ## instructions (time offsets in the nanosecond range). When no timestamp
  ## counter is available, a best effort implementation is used instead.
  ##
  ## Note that the CPU may reorder instructions.
  runnableExamples:
    for _ in 0 ..< 100:
      let before = getCpuTicks()
      # code to benchmark can go here
      let after = getCpuTicks()
      assert after > before
  # Bit-reinterpret (rather than range-check) the backend value as int64.
  cast[int64](getCpuTicksImpl())
template toInt64(lo, hi: untyped): untyped =
  ## Joins the low (`lo`) and high (`hi`) 32-bit halves of a timestamp into
  ## one `int64`: `hi` is shifted into the upper 32 bits and OR-ed with `lo`.
  ## Both operands are taken from the arguments; nothing is captured from
  ## the caller's scope.
  cast[int64](cast[uint64](lo) or (cast[uint64](hi) shl 32))

proc getCpuTicksStart*(): int64 {.inline.} =
  ## Variant of `getCpuTicks` meant to open a measured region; pair it with
  ## `getCpuTicksEnd`, which closes the region using the `RDTSCP`
  ## instruction. Compared to `getCpuTicks`, this avoids introducing noise in
  ## the measurements caused by CPU instruction reordering, and can result in
  ## more deterministic results, at the expense of extra overhead and
  ## requiring asymmetric start/stop APIs.
  ##
  ## A best effort implementation is provided for platforms where `RDTSCP` is
  ## not available.
  runnableExamples:
    for i in 0..<100:
      let t1 = getCpuTicksStart()
      # code to benchmark can go here
      let t2 = getCpuTicksEnd()
      assert t2 > t1, $(t1, t2)
  when nimvm: result = getCpuTicks()
  else:
    when defined(js): result = getCpuTicks()
    else:
      var lo {.noinit.}: cuint
      var hi {.noinit.}: cuint
      # See https://developers.redhat.com/blog/2016/03/11/practical-micro-benchmarking-with-ltrace-and-sched
      # `cpuid` runs first as a barrier, then `rdtsc` reads the counter.
      # The Nim locals are interpolated with backticks so the generated C
      # refers to their real (possibly mangled) names.
      {.emit: """
      asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
      asm volatile("rdtsc" : "=a" (`lo`), "=d" (`hi`));
      """.}
      result = toInt64(lo, hi)

proc getCpuTicksEnd*(): int64 {.inline.} =
  ## Closes a region opened with `getCpuTicksStart`: reads the counter with
  ## `RDTSCP` and then issues `CPUID`.
  ##
  ## See `getCpuTicksStart <#getCpuTicksStart>`_
  when nimvm: result = getCpuTicks()
  else:
    when defined(js): result = getCpuTicks()
    else:
      var lo {.noinit.}: cuint
      var hi {.noinit.}: cuint
      # `rdtscp` also writes ECX, so RCX is declared as clobbered; the
      # trailing `cpuid` mirrors the barrier used by `getCpuTicksStart`.
      {.emit: """
      asm volatile("rdtscp" : "=a" (`lo`), "=d" (`hi`) : : "%rcx");
      asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
      """.}
      result = toInt64(lo, hi)
# https://forum.nim-lang.org/t/9504 | |
macro unroll(x: static int, name, body: untyped) =
  ## Expands `body` once per generated size, each copy wrapped in its own
  ## `block` in which `name` is declared as a `const` holding that size.
  ##
  ## Sizes are produced by doubling a base that starts at 512 while the base
  ## is below `x`. For every doubled base `b`, five sizes are emitted in
  ## order: `b`, `b + b/2`, `b + b/2 + b/4`, `b + b/2 + b/4 + b/8` and
  ## `b + b/2 + b/4 + b/8 + b/16` (integer division).
  result = newStmtList()
  var base = 512
  while base < x:
    base *= 2
    var
      size = base
      increment = base div 2
    for _ in 1 .. 5:
      result.add newBlockStmt(
        newStmtList(newConstStmt(name, newLit(size)), copy(body)))
      # Next size adds the next smaller power-of-two fraction of `base`.
      size += increment
      increment = increment div 2
template timeInNS(body: untyped): int64 =
  ## Runs `body` once and yields its duration in nanoseconds as measured
  ## with `getMonoTime`. The span between two back-to-back clock reads is
  ## taken first and subtracted as measurement noise; the absolute value of
  ## the difference plus one is returned, so the result is always >= 1.
  let calibrationStart = getMonoTime()
  let calibrationEnd = getMonoTime()
  let overhead = inNanoseconds(calibrationEnd - calibrationStart)
  let began = getMonoTime()
  body
  let finished = getMonoTime()
  abs(inNanoseconds(finished - began) - overhead) + 1
template ticks(body: untyped): int64 =
  ## Runs `body` once between `getCpuTicksStart` and `getCpuTicksEnd` and
  ## yields the elapsed tick count. The tick delta of an empty start/end
  ## pair is taken first and subtracted as measurement noise; the absolute
  ## value of the difference plus one is returned, so the result is
  ## always >= 1.
  let calibrationStart = getCpuTicksStart()
  let calibrationEnd = getCpuTicksEnd()
  let overhead = calibrationEnd - calibrationStart
  let began = getCpuTicksStart()
  body
  let finished = getCpuTicksEnd()
  abs(finished - began - overhead) + 1
proc cope(size: static int): void =
  ## Benchmarks `copyMem` for a buffer of `size` bytes and prints one report
  ## line: the size, the elapsed CPU ticks and bytes per tick for a copy
  ## from `bufferA` to `bufferB`, then the elapsed nanoseconds and bytes per
  ## nanosecond for a copy in the opposite direction.
  let
    bufferA = create(uint8, size)
    bufferB = create(uint8, size)
  # `create` hands out unmanaged memory; release both buffers on scope exit
  # so that repeated instantiations do not leak 2 * size bytes each.
  defer:
    dealloc bufferB
    dealloc bufferA
  let
    chicks = ticks:
      copyMem bufferB, bufferA, size
    tns = timeInNS:
      copyMem bufferA, bufferB, size
  # `ticks` and `timeInNS` always yield at least 1, so the divisions below
  # cannot divide by zero.
  echo &"{size:>8} bytes, {chicks:>8} ticks, {(size div chicks):>4} B/tick, {tns:>8} ns, {(size div tns):>4} B/ns"
# Threshold handed to `unroll`; defaults to 262144 and is declared with
# `intdefine` under the name ".maxSize" so it can be set at compile time.
const maxSize {.intdefine: ".maxSize".} = 262144

# Run the benchmark once for every size that `unroll` generates.
unroll(maxSize, size):
  cope size
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment