Skip to content

Instantly share code, notes, and snippets.

@lu-zero
Last active May 22, 2026 11:32
Show Gist options
  • Select an option

  • Save lu-zero/c214975c965bdcc6b8f52434b699566c to your computer and use it in GitHub Desktop.

Select an option

Save lu-zero/c214975c965bdcc6b8f52434b699566c to your computer and use it in GitHub Desktop.
memcpy streaming-store benchmark: x86 AVX2 vs AArch64 STNP, with/without software prefetch
// Standalone memcpy-to-WC benchmark — x86 and AArch64.
// Compile: clang++ -O3 -march=native bench_memcpy_x86_neon.cpp -o bench && ./bench
//
// Tests streaming-store variants vs libc memcpy.
// On x86 the comparison is: AVX2 stream with/without prefetch.
// On AArch64 the comparison is: STNP with/without software prefetch.
// Also: NEON intrinsics and inline assembly variants.
// Each variant gets one untimed warm-up call and is then timed 7 times; the best run is reported.
#include <cstdint>
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include <time.h>
#if defined(__x86_64__) || defined(__i386__)
# include <immintrin.h>
# define HAVE_X86 1
#elif __has_builtin(__builtin_nontemporal_store)
# define HAVE_NT_BUILTIN 1
# define HAVE_NEON 1
#endif
#if HAVE_NEON
# include <arm_neon.h>
#endif
// Current CLOCK_MONOTONIC reading, expressed in seconds as a double.
static double now_s() {
    struct timespec stamp;
    clock_gettime(CLOCK_MONOTONIC, &stamp);
    // Whole seconds plus the nanosecond remainder scaled to seconds.
    return (double)stamp.tv_sec + (double)stamp.tv_nsec * 1e-9;
}
// ── x86 variants ─────────────────────────────────────────────────────────────
#if HAVE_X86
__attribute__((noinline))
static void stream_store(void* __restrict dst, const void* __restrict src, size_t n) {
auto* d = (uint8_t*)dst;
auto* s = (const uint8_t*)src;
for (size_t i = 0; i < n / 32; ++i, s += 32, d += 32)
_mm256_stream_si256((__m256i*)d, _mm256_loadu_si256((const __m256i*)s));
_mm_sfence();
}
__attribute__((noinline))
static void stream_store_pf512(void* __restrict dst, const void* __restrict src, size_t n) {
auto* d = (uint8_t*)dst;
auto* s = (const uint8_t*)src;
for (size_t i = 0; i < n / 32; ++i, s += 32, d += 32) {
_mm_prefetch((const char*)(s + 512), _MM_HINT_T0);
_mm256_stream_si256((__m256i*)d, _mm256_loadu_si256((const __m256i*)s));
}
_mm_sfence();
}
__attribute__((noinline))
static void stream_store_pf2k(void* __restrict dst, const void* __restrict src, size_t n) {
auto* d = (uint8_t*)dst;
auto* s = (const uint8_t*)src;
for (size_t i = 0; i < n / 32; ++i, s += 32, d += 32) {
_mm_prefetch((const char*)(s + 2048), _MM_HINT_NTA);
_mm256_stream_si256((__m256i*)d, _mm256_loadu_si256((const __m256i*)s));
}
_mm_sfence();
}
#endif // HAVE_X86
// ── AArch64 builtin variants ────────────────────────────────────────────────
#if HAVE_NT_BUILTIN
// 32-byte vector of bytes declared with alignment 1, so accesses through it
// carry no alignment assumption.
typedef uint8_t __attribute__((vector_size(32), aligned(1))) nt_v256;

// Copy `n` bytes from `src` to `dst` in 32-byte chunks written with
// __builtin_nontemporal_store, with no software prefetch. The benchmark
// labels this variant "stnp".
//
// Whole 32-byte chunks use the non-temporal store; a trailing n % 32 bytes,
// if any, are finished with a regular memcpy so that every byte is copied.
__attribute__((noinline))
static void nt_store(void* __restrict dst, const void* __restrict src, size_t n) {
    uint8_t* d = (uint8_t*)dst;
    const uint8_t* s = (const uint8_t*)src;
    const size_t chunks = n / 32;
    for (size_t i = 0; i < chunks; ++i, s += 32, d += 32) {
        nt_v256 c;
        __builtin_memcpy(&c, s, 32);
        __builtin_nontemporal_store(c, (nt_v256*)d);
    }
    const size_t tail = n % 32;
    if (tail != 0) {
        memcpy(d, s, tail);
    }
}
// Copy `n` bytes from `src` to `dst` in 32-byte chunks written with
// __builtin_nontemporal_store, issuing a read prefetch (locality 0) 512 bytes
// ahead of each load. The benchmark labels this variant "stnp + pf@512B".
//
// Whole 32-byte chunks use the non-temporal store; a trailing n % 32 bytes,
// if any, are finished with a regular memcpy so that every byte is copied.
__attribute__((noinline))
static void nt_store_pf512(void* __restrict dst, const void* __restrict src, size_t n) {
    uint8_t* d = (uint8_t*)dst;
    const uint8_t* s = (const uint8_t*)src;
    const size_t chunks = n / 32;
    for (size_t i = 0; i < chunks; ++i, s += 32, d += 32) {
        // Near the end of the buffer the hinted address lies past src; the
        // prefetch is only a hint and is not dereferenced by this code.
        __builtin_prefetch(s + 512, 0, 0);
        nt_v256 c;
        __builtin_memcpy(&c, s, 32);
        __builtin_nontemporal_store(c, (nt_v256*)d);
    }
    const size_t tail = n % 32;
    if (tail != 0) {
        memcpy(d, s, tail);
    }
}
// Copy `n` bytes from `src` to `dst` in 32-byte chunks written with
// __builtin_nontemporal_store, issuing a read prefetch (locality 0) 2048 bytes
// ahead of each load. The benchmark labels this variant "stnp + pf@2KB (current)".
//
// Whole 32-byte chunks use the non-temporal store; a trailing n % 32 bytes,
// if any, are finished with a regular memcpy so that every byte is copied.
__attribute__((noinline))
static void nt_store_pf2k(void* __restrict dst, const void* __restrict src, size_t n) {
    uint8_t* d = (uint8_t*)dst;
    const uint8_t* s = (const uint8_t*)src;
    const size_t chunks = n / 32;
    for (size_t i = 0; i < chunks; ++i, s += 32, d += 32) {
        // Near the end of the buffer the hinted address lies past src; the
        // prefetch is only a hint and is not dereferenced by this code.
        __builtin_prefetch(s + 2048, 0, 0);
        nt_v256 c;
        __builtin_memcpy(&c, s, 32);
        __builtin_nontemporal_store(c, (nt_v256*)d);
    }
    const size_t tail = n % 32;
    if (tail != 0) {
        memcpy(d, s, tail);
    }
}
#endif // HAVE_NT_BUILTIN
// ── AArch64 NEON intrinsics variants ────────────────────────────────────────
#if HAVE_NEON
// Copy `n` bytes from `src` to `dst` with plain NEON loads and stores
// (vld1q_u8 / vst1q_u8), four 16-byte vectors per main-loop iteration and no
// software prefetch.
//
// Any `n` is handled: 64-byte blocks first, then 16-byte vectors, then a
// memcpy for the last n % 16 bytes. The remaining-byte counter is compared
// with `>=` so small sizes never underflow and nothing past `n` is touched.
__attribute__((noinline))
static void neon_store(void* __restrict dst, const void* __restrict src, size_t n) {
    uint8_t* d = (uint8_t*)dst;
    const uint8_t* s = (const uint8_t*)src;
    size_t left = n;
    for (; left >= 64; left -= 64, s += 64, d += 64) {
        // Issue all four loads before the stores, as in the unrolled original.
        uint8x16_t v0 = vld1q_u8(s);
        uint8x16_t v1 = vld1q_u8(s + 16);
        uint8x16_t v2 = vld1q_u8(s + 32);
        uint8x16_t v3 = vld1q_u8(s + 48);
        vst1q_u8(d, v0);
        vst1q_u8(d + 16, v1);
        vst1q_u8(d + 32, v2);
        vst1q_u8(d + 48, v3);
    }
    for (; left >= 16; left -= 16, s += 16, d += 16) {
        vst1q_u8(d, vld1q_u8(s));
    }
    if (left != 0) {
        memcpy(d, s, left);
    }
}
// Copy `n` bytes from `src` to `dst` with plain NEON loads and stores
// (vld1q_u8 / vst1q_u8), four 16-byte vectors per main-loop iteration, issuing
// a read prefetch (locality 0) 512 bytes ahead at the top of each iteration.
//
// Any `n` is handled: 64-byte blocks first, then 16-byte vectors, then a
// memcpy for the last n % 16 bytes. The remaining-byte counter is compared
// with `>=` so small sizes never underflow and nothing past `n` is touched.
__attribute__((noinline))
static void neon_store_pf512(void* __restrict dst, const void* __restrict src, size_t n) {
    uint8_t* d = (uint8_t*)dst;
    const uint8_t* s = (const uint8_t*)src;
    size_t left = n;
    for (; left >= 64; left -= 64, s += 64, d += 64) {
        // Near the end of the buffer the hinted address lies past src; the
        // prefetch is only a hint and is not dereferenced by this code.
        __builtin_prefetch(s + 512, 0, 0);
        uint8x16_t v0 = vld1q_u8(s);
        uint8x16_t v1 = vld1q_u8(s + 16);
        uint8x16_t v2 = vld1q_u8(s + 32);
        uint8x16_t v3 = vld1q_u8(s + 48);
        vst1q_u8(d, v0);
        vst1q_u8(d + 16, v1);
        vst1q_u8(d + 32, v2);
        vst1q_u8(d + 48, v3);
    }
    for (; left >= 16; left -= 16, s += 16, d += 16) {
        vst1q_u8(d, vld1q_u8(s));
    }
    if (left != 0) {
        memcpy(d, s, left);
    }
}
#endif // HAVE_NEON
// ── AArch64 inline assembly variants ────────────────────────────────────────
#if HAVE_NT_BUILTIN
__attribute__((noinline))
static void asm_memcpy_loop64(void* __restrict dst, const void* __restrict src, size_t count) {
if (count < 64) {
memcpy(dst, src, count);
return;
}
uint8_t* d = (uint8_t*)dst;
const uint8_t* s = (const uint8_t*)src;
size_t n = count;
__builtin_prefetch(s + 64, 0, 0);
__builtin_prefetch(s + 128, 0, 0);
__asm volatile(
" .p2align 3 \n"
"1: \n"
" prfm pldl1keep, [%[s], #192] \n"
" ldp q0, q1, [%[s]] \n"
" ldp q2, q3, [%[s], #32] \n"
" add %[s], %[s], #64 \n"
" stp q0, q1, [%[d]] \n"
" stp q2, q3, [%[d], #32] \n"
" add %[d], %[d], #64 \n"
" subs %[n], %[n], #64 \n"
" b.hi 1b \n"
: [d] "+r" (d), [s] "+r" (s), [n] "+r" (n)
:
: "memory", "cc", "v0", "v1", "v2", "v3"
);
if (n > 0) memcpy(d, s, n);
}
__attribute__((noinline))
static void asm_memcpy_simple(void* __restrict dst, const void* __restrict src, size_t count) {
if (count < 64) {
memcpy(dst, src, count);
return;
}
uint8_t* d = (uint8_t*)dst;
const uint8_t* s = (const uint8_t*)src;
size_t n = count;
__builtin_prefetch(s + 64, 0, 0);
__asm volatile(
" .p2align 3 \n"
"1: \n"
" ldp q0, q1, [%[s]] \n"
" ldp q2, q3, [%[s], #32] \n"
" add %[s], %[s], #64 \n"
" stp q0, q1, [%[d]] \n"
" stp q2, q3, [%[d], #32] \n"
" add %[d], %[d], #64 \n"
" subs %[n], %[n], #64 \n"
" b.hi 1b \n"
: [d] "+r" (d), [s] "+r" (s), [n] "+r" (n)
:
: "memory", "cc", "v0", "v1", "v2", "v3"
);
if (n > 0) memcpy(d, s, n);
}
#endif // HAVE_NT_BUILTIN
// ── runner ───────────────────────────────────────────────────────────────────
// Pointer to a copy routine taking (dst, src, byte count) and returning nothing.
typedef void (*fn_t)(void*, const void*, size_t);

// Run `fn(dst, src, sz)` once untimed, then 7 timed times, and print the
// highest throughput seen, computed as sz / elapsed / 1e9 and labelled GB/s.
static void bench(const char* name, fn_t fn, void* dst, const void* src, size_t sz) {
    fn(dst, src, sz);  // warm-up call, excluded from timing

    double peak = 0;
    int run = 0;
    while (run < 7) {
        const double start = now_s();
        fn(dst, src, sz);
        const double elapsed = now_s() - start;
        const double gbps = sz / elapsed / 1e9;
        peak = (gbps > peak) ? gbps : peak;
        ++run;
    }
    printf("%-30s %.2f GB/s\n", name, peak);
}
// memcpy with its return value dropped, so the signature matches fn_t exactly.
// Calling memcpy through a cast to a different function-pointer type is
// undefined behaviour; noinline keeps it a real call like the other variants.
__attribute__((noinline))
static void libc_memcpy(void* __restrict dst, const void* __restrict src, size_t n) {
    memcpy(dst, src, n);
}

// Allocate two 64-byte-aligned 1 GiB buffers, fill them, and benchmark every
// copy variant compiled in for this target, followed by libc memcpy.
// Returns 1 if either allocation fails, 0 otherwise.
int main() {
    const size_t SZ = 1ULL << 30; // 1 GiB
    void* src = aligned_alloc(64, SZ);
    void* dst = aligned_alloc(64, SZ);
    if (!src || !dst) {
        fprintf(stderr, "aligned_alloc failed for 2 x %zu bytes\n", SZ);
        free(src);
        free(dst);
        return 1;
    }
    // Write to both buffers before any timing starts.
    memset(src, 0xAB, SZ);
    memset(dst, 0xCD, SZ);
#if HAVE_X86
    bench("avx2 stream (no prefetch)", stream_store, dst, src, SZ);
    bench("avx2 stream + pf@512B", stream_store_pf512, dst, src, SZ);
    bench("avx2 stream + pf@2KB", stream_store_pf2k, dst, src, SZ);
#elif HAVE_NT_BUILTIN
    bench("stnp (no prefetch)", nt_store, dst, src, SZ);
    bench("stnp + pf@512B", nt_store_pf512, dst, src, SZ);
    bench("stnp + pf@2KB (current)", nt_store_pf2k, dst, src, SZ);
#endif
#if HAVE_NEON
    bench("neon (no prefetch)", neon_store, dst, src, SZ);
    bench("neon + pf@512B", neon_store_pf512, dst, src, SZ);
#endif
#if HAVE_NT_BUILTIN
    bench("asm loop64 + prefetch", asm_memcpy_loop64, dst, src, SZ);
    bench("asm simple", asm_memcpy_simple, dst, src, SZ);
#endif
    bench("libc memcpy", libc_memcpy, dst, src, SZ);
    free(src);
    free(dst);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment