Skip to content

Instantly share code, notes, and snippets.

View tanakamura's full-sized avatar

Takashi Nakamura tanakamura

View GitHub Profile
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
/*
 * Read CLOCK_MONOTONIC and return the reading as a single double,
 * expressed in seconds (whole seconds plus the nanosecond part scaled
 * by 1e9). The return value of clock_gettime() is not inspected.
 */
double getsec() {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    double whole_sec = now.tv_sec;
    double frac_sec = now.tv_nsec / 1e9;
    return whole_sec + frac_sec;
}
/*
* A store issued after clzero performs far worse than memset.
*
* * clzero only    : 30 [GB/s]
* * clzero + store : 3 [GB/s]
* * memset         : 12.5 [GB/s]
*/
void *amd_clzero(void *dst, const void *src, size_t sz)
{
size_t line_size = 64;
num_thread = 1
libc-memset : 1024[ B] 42.105008[GB/s]
libc-memset : 2048[ B] 42.338436[GB/s]
libc-memset : 4096[ B] 49.077192[GB/s]
libc-memset : 8192[ B] 48.563454[GB/s]
libc-memset : 16[KB] 50.124791[GB/s]
libc-memset : 32[KB] 49.960671[GB/s]
libc-memset : 64[KB] 49.425556[GB/s]
libc-memset : 128[KB] 49.101868[GB/s]
libc-memset : 256[KB] 47.760674[GB/s]
#include <stdio.h>
#include <unistd.h>
#include <time.h>
#include <sys/mman.h>
#include <getopt.h>
#include <sys/time.h>
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#include <pthread.h>
/*
 * Sample CLOCK_MONOTONIC and convert the timespec to seconds as a
 * double: the nanosecond field is divided by 1e9 and added to the
 * seconds field. The status returned by clock_gettime() is ignored.
 */
static double getsec()
{
    struct timespec now;
    double nsec_in_sec;

    clock_gettime(CLOCK_MONOTONIC, &now);
    nsec_in_sec = now.tv_nsec / 1e9;
    return nsec_in_sec + now.tv_sec;
}
static double get_sec() {
struct timespec ts;
#include <stdio.h>
#include <x86intrin.h>
#include <math.h>
#include <unistd.h>
#include <time.h>
#include <sys/time.h>
static double calib_ops() {
long long n = 1024*1024*4;
long long core_cycle = 64 * n;
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/time.h>
#include <time.h>
#include <stdlib.h>
#include <cpuid.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
AMD Ryzen 7 1700X Eight-Core Processor
4k page
parallel rand read : 0.464111[ns/read], range=64[KB]
parallel rand read : 0.494720[ns/read], range=128[KB]
parallel rand read : 0.567612[ns/read], range=256[KB]
parallel rand read : 0.642448[ns/read], range=512[KB]
parallel rand read : 0.713230[ns/read], range=1024[KB]
parallel rand read : 0.737040[ns/read], range=2048[KB]
parallel rand read : 0.773684[ns/read], range=4096[KB]
parallel rand read : 2.559083[ns/read], range=8192[KB]
Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz
4k page
parallel rand read : 0.475830[ns/read], range=64[KB]
parallel rand read : 0.582672[ns/read], range=128[KB]
parallel rand read : 0.691971[ns/read], range=256[KB]
parallel rand read : 0.981163[ns/read], range=512[KB]
parallel rand read : 1.212128[ns/read], range=1024[KB]
parallel rand read : 1.309196[ns/read], range=2048[KB]
parallel rand read : 1.353122[ns/read], range=4096[KB]
parallel rand read : 2.068208[ns/read], range=8192[KB]
# DDR4-2400, 2 channels, 4 threads (theoretical peak = 38.4 GB/s)
num_thread = 1
libc-memset : 1024[ B] 43.663790[GB/s]
libc-memset : 2048[ B] 46.885033[GB/s]
libc-memset : 4096[ B] 48.524845[GB/s]
libc-memset : 8192[ B] 48.603032[GB/s]
libc-memset : 16[KB] 46.270248[GB/s]
libc-memset : 32[KB] 49.114524[GB/s]
libc-memset : 64[KB] 47.925331[GB/s]