Last active
May 22, 2026 11:32
-
-
Save lu-zero/c214975c965bdcc6b8f52434b699566c to your computer and use it in GitHub Desktop.
memcpy streaming-store benchmark: x86 AVX2 vs AArch64 STNP, with/without software prefetch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Standalone memcpy-to-WC benchmark — x86 and AArch64. | |
| // Compile: clang++ -O3 -march=native bench_memcpy_x86_neon.cpp -o bench && ./bench | |
| // | |
| // Tests streaming-store variants vs libc memcpy. | |
| // On x86 the comparison is: AVX2 stream with/without prefetch. | |
| // On AArch64 the comparison is: STNP with/without software prefetch. | |
| // Also: NEON intrinsics and inline assembly variants. | |
| // Each variant gets one warm-up call followed by 7 timed runs; the best run is reported. | |
| #include <cstdint> | |
| #include <cstring> | |
| #include <cstdlib> | |
| #include <cstdio> | |
| #include <time.h> | |
| #if defined(__x86_64__) || defined(__i386__) | |
| # include <immintrin.h> | |
| # define HAVE_X86 1 | |
| #elif __has_builtin(__builtin_nontemporal_store) | |
| # define HAVE_NT_BUILTIN 1 | |
| # define HAVE_NEON 1 | |
| #endif | |
| #if HAVE_NEON | |
| # include <arm_neon.h> | |
| #endif | |
// Read CLOCK_MONOTONIC and return it as seconds in a double
// (whole seconds plus the nanosecond field scaled by 1e-9).
static double now_s() {
    struct timespec stamp;
    clock_gettime(CLOCK_MONOTONIC, &stamp);
    const double frac = stamp.tv_nsec * 1e-9;
    return stamp.tv_sec + frac;
}
| // ── x86 variants ───────────────────────────────────────────────────────────── | |
| #if HAVE_X86 | |
| __attribute__((noinline)) | |
| static void stream_store(void* __restrict dst, const void* __restrict src, size_t n) { | |
| auto* d = (uint8_t*)dst; | |
| auto* s = (const uint8_t*)src; | |
| for (size_t i = 0; i < n / 32; ++i, s += 32, d += 32) | |
| _mm256_stream_si256((__m256i*)d, _mm256_loadu_si256((const __m256i*)s)); | |
| _mm_sfence(); | |
| } | |
| __attribute__((noinline)) | |
| static void stream_store_pf512(void* __restrict dst, const void* __restrict src, size_t n) { | |
| auto* d = (uint8_t*)dst; | |
| auto* s = (const uint8_t*)src; | |
| for (size_t i = 0; i < n / 32; ++i, s += 32, d += 32) { | |
| _mm_prefetch((const char*)(s + 512), _MM_HINT_T0); | |
| _mm256_stream_si256((__m256i*)d, _mm256_loadu_si256((const __m256i*)s)); | |
| } | |
| _mm_sfence(); | |
| } | |
| __attribute__((noinline)) | |
| static void stream_store_pf2k(void* __restrict dst, const void* __restrict src, size_t n) { | |
| auto* d = (uint8_t*)dst; | |
| auto* s = (const uint8_t*)src; | |
| for (size_t i = 0; i < n / 32; ++i, s += 32, d += 32) { | |
| _mm_prefetch((const char*)(s + 2048), _MM_HINT_NTA); | |
| _mm256_stream_si256((__m256i*)d, _mm256_loadu_si256((const __m256i*)s)); | |
| } | |
| _mm_sfence(); | |
| } | |
| #endif // HAVE_X86 | |
| // ── AArch64 builtin variants ──────────────────────────────────────────────── | |
| #if HAVE_NT_BUILTIN | |
| typedef uint8_t __attribute__((vector_size(32), aligned(1))) nt_v256; | |
| __attribute__((noinline)) | |
| static void nt_store(void* __restrict dst, const void* __restrict src, size_t n) { | |
| auto* d = (uint8_t*)dst; | |
| auto* s = (const uint8_t*)src; | |
| for (size_t i = 0; i < n / 32; ++i, s += 32, d += 32) { | |
| nt_v256 c; __builtin_memcpy(&c, s, 32); | |
| __builtin_nontemporal_store(c, (nt_v256*)d); | |
| } | |
| } | |
| __attribute__((noinline)) | |
| static void nt_store_pf512(void* __restrict dst, const void* __restrict src, size_t n) { | |
| auto* d = (uint8_t*)dst; | |
| auto* s = (const uint8_t*)src; | |
| for (size_t i = 0; i < n / 32; ++i, s += 32, d += 32) { | |
| __builtin_prefetch(s + 512, 0, 0); | |
| nt_v256 c; __builtin_memcpy(&c, s, 32); | |
| __builtin_nontemporal_store(c, (nt_v256*)d); | |
| } | |
| } | |
| __attribute__((noinline)) | |
| static void nt_store_pf2k(void* __restrict dst, const void* __restrict src, size_t n) { | |
| auto* d = (uint8_t*)dst; | |
| auto* s = (const uint8_t*)src; | |
| for (size_t i = 0; i < n / 32; ++i, s += 32, d += 32) { | |
| __builtin_prefetch(s + 2048, 0, 0); | |
| nt_v256 c; __builtin_memcpy(&c, s, 32); | |
| __builtin_nontemporal_store(c, (nt_v256*)d); | |
| } | |
| } | |
| #endif // HAVE_NT_BUILTIN | |
| // ── AArch64 NEON intrinsics variants ──────────────────────────────────────── | |
| #if HAVE_NEON | |
// Copy n bytes from src to dst with NEON 128-bit loads/stores, 64 bytes per
// main-loop iteration, without software prefetch.
//
//   dst, src  non-overlapping buffers of at least n bytes
//   n         number of bytes to copy (any value, including 0)
//
// After the 64-byte loop, whole 16-byte vectors are copied one at a time and
// any final 1..15 bytes go through memcpy, so exactly n bytes are touched.
__attribute__((noinline))
static void neon_store(void* __restrict dst, const void* __restrict src, size_t n) {
    auto* d = (uint8_t*)dst;
    auto* s = (const uint8_t*)src;
    size_t i = 0;
    // Compare the remaining byte count (n - i) rather than computing n - 64:
    // i never exceeds n, so this cannot wrap around when n < 64.
    for (; n - i >= 64; i += 64, s += 64, d += 64) {
        // All four loads are issued before the stores.
        uint8x16_t v0 = vld1q_u8(s);
        uint8x16_t v1 = vld1q_u8(s + 16);
        uint8x16_t v2 = vld1q_u8(s + 32);
        uint8x16_t v3 = vld1q_u8(s + 48);
        vst1q_u8(d, v0);
        vst1q_u8(d + 16, v1);
        vst1q_u8(d + 32, v2);
        vst1q_u8(d + 48, v3);
    }
    // Whole 16-byte vectors only, so the last vector never crosses the end.
    for (; n - i >= 16; i += 16, s += 16, d += 16) {
        vst1q_u8(d, vld1q_u8(s));
    }
    // Fewer than 16 bytes left.
    if (i < n)
        memcpy(d, s, n - i);
}
// Copy n bytes from src to dst with NEON 128-bit loads/stores, 64 bytes per
// main-loop iteration, issuing a read prefetch (locality 0) 512 bytes ahead of
// the current source position on every main-loop iteration.
//
//   dst, src  non-overlapping buffers of at least n bytes
//   n         number of bytes to copy (any value, including 0)
//
// After the 64-byte loop, whole 16-byte vectors are copied one at a time and
// any final 1..15 bytes go through memcpy, so exactly n bytes are touched.
__attribute__((noinline))
static void neon_store_pf512(void* __restrict dst, const void* __restrict src, size_t n) {
    auto* d = (uint8_t*)dst;
    auto* s = (const uint8_t*)src;
    size_t i = 0;
    // Compare the remaining byte count (n - i) rather than computing n - 64:
    // i never exceeds n, so this cannot wrap around when n < 64.
    for (; n - i >= 64; i += 64, s += 64, d += 64) {
        __builtin_prefetch(s + 512, 0, 0);
        // All four loads are issued before the stores.
        uint8x16_t v0 = vld1q_u8(s);
        uint8x16_t v1 = vld1q_u8(s + 16);
        uint8x16_t v2 = vld1q_u8(s + 32);
        uint8x16_t v3 = vld1q_u8(s + 48);
        vst1q_u8(d, v0);
        vst1q_u8(d + 16, v1);
        vst1q_u8(d + 32, v2);
        vst1q_u8(d + 48, v3);
    }
    // Whole 16-byte vectors only, so the last vector never crosses the end.
    for (; n - i >= 16; i += 16, s += 16, d += 16) {
        vst1q_u8(d, vld1q_u8(s));
    }
    // Fewer than 16 bytes left.
    if (i < n)
        memcpy(d, s, n - i);
}
| #endif // HAVE_NEON | |
| // ── AArch64 inline assembly variants ──────────────────────────────────────── | |
| #if HAVE_NT_BUILTIN | |
| __attribute__((noinline)) | |
| static void asm_memcpy_loop64(void* __restrict dst, const void* __restrict src, size_t count) { | |
| if (count < 64) { | |
| memcpy(dst, src, count); | |
| return; | |
| } | |
| uint8_t* d = (uint8_t*)dst; | |
| const uint8_t* s = (const uint8_t*)src; | |
| size_t n = count; | |
| __builtin_prefetch(s + 64, 0, 0); | |
| __builtin_prefetch(s + 128, 0, 0); | |
| __asm volatile( | |
| " .p2align 3 \n" | |
| "1: \n" | |
| " prfm pldl1keep, [%[s], #192] \n" | |
| " ldp q0, q1, [%[s]] \n" | |
| " ldp q2, q3, [%[s], #32] \n" | |
| " add %[s], %[s], #64 \n" | |
| " stp q0, q1, [%[d]] \n" | |
| " stp q2, q3, [%[d], #32] \n" | |
| " add %[d], %[d], #64 \n" | |
| " subs %[n], %[n], #64 \n" | |
| " b.hi 1b \n" | |
| : [d] "+r" (d), [s] "+r" (s), [n] "+r" (n) | |
| : | |
| : "memory", "cc", "v0", "v1", "v2", "v3" | |
| ); | |
| if (n > 0) memcpy(d, s, n); | |
| } | |
| __attribute__((noinline)) | |
| static void asm_memcpy_simple(void* __restrict dst, const void* __restrict src, size_t count) { | |
| if (count < 64) { | |
| memcpy(dst, src, count); | |
| return; | |
| } | |
| uint8_t* d = (uint8_t*)dst; | |
| const uint8_t* s = (const uint8_t*)src; | |
| size_t n = count; | |
| __builtin_prefetch(s + 64, 0, 0); | |
| __asm volatile( | |
| " .p2align 3 \n" | |
| "1: \n" | |
| " ldp q0, q1, [%[s]] \n" | |
| " ldp q2, q3, [%[s], #32] \n" | |
| " add %[s], %[s], #64 \n" | |
| " stp q0, q1, [%[d]] \n" | |
| " stp q2, q3, [%[d], #32] \n" | |
| " add %[d], %[d], #64 \n" | |
| " subs %[n], %[n], #64 \n" | |
| " b.hi 1b \n" | |
| : [d] "+r" (d), [s] "+r" (s), [n] "+r" (n) | |
| : | |
| : "memory", "cc", "v0", "v1", "v2", "v3" | |
| ); | |
| if (n > 0) memcpy(d, s, n); | |
| } | |
| #endif // HAVE_NT_BUILTIN | |
| // ── runner ─────────────────────────────────────────────────────────────────── | |
| using fn_t = void(*)(void*, const void*, size_t); | |
| static void bench(const char* name, fn_t fn, void* dst, const void* src, size_t sz) { | |
| fn(dst, src, sz); // warmup | |
| double best = 0; | |
| for (int r = 0; r < 7; ++r) { | |
| double t0 = now_s(); | |
| fn(dst, src, sz); | |
| double dt = now_s() - t0; | |
| double bw = sz / dt / 1e9; | |
| if (bw > best) best = bw; | |
| } | |
| printf("%-30s %.2f GB/s\n", name, best); | |
| } | |
| int main() { | |
| const size_t SZ = 1ULL << 30; // 1 GB | |
| void* src = aligned_alloc(64, SZ); | |
| void* dst = aligned_alloc(64, SZ); | |
| memset(src, 0xAB, SZ); | |
| memset(dst, 0xCD, SZ); | |
| #if HAVE_X86 | |
| bench("avx2 stream (no prefetch)", stream_store, dst, src, SZ); | |
| bench("avx2 stream + pf@512B", stream_store_pf512, dst, src, SZ); | |
| bench("avx2 stream + pf@2KB", stream_store_pf2k, dst, src, SZ); | |
| #elif HAVE_NT_BUILTIN | |
| bench("stnp (no prefetch)", nt_store, dst, src, SZ); | |
| bench("stnp + pf@512B", nt_store_pf512, dst, src, SZ); | |
| bench("stnp + pf@2KB (current)", nt_store_pf2k, dst, src, SZ); | |
| #endif | |
| #if HAVE_NEON | |
| bench("neon (no prefetch)", neon_store, dst, src, SZ); | |
| bench("neon + pf@512B", neon_store_pf512, dst, src, SZ); | |
| #endif | |
| #if HAVE_NT_BUILTIN | |
| bench("asm loop64 + prefetch", asm_memcpy_loop64, dst, src, SZ); | |
| bench("asm simple", asm_memcpy_simple, dst, src, SZ); | |
| #endif | |
| bench("libc memcpy", (fn_t)memcpy, dst, src, SZ); | |
| free(src); | |
| free(dst); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment