Skip to content

Instantly share code, notes, and snippets.

@jwatte
Created July 3, 2022 20:56
Show Gist options
  • Save jwatte/cafe738fb9d0d45c648d470d629d091b to your computer and use it in GitHub Desktop.
Save jwatte/cafe738fb9d0d45c648d470d629d091b to your computer and use it in GitHub Desktop.
# Build both configurations, then run the debug binary followed by the
# release binary. `test` names an action, not a file, so it is marked phony.
.PHONY: test
test: malloc-release malloc-debug
	./malloc-debug
	./malloc-release

# Unoptimized build with debug info. NDEBUG is left undefined, so the
# benchmark labels its output "DEBUG".
malloc-debug: MallocBenchmark.cpp
	clang -O0 -D_DEBUG=1 -g -o malloc-debug MallocBenchmark.cpp

# Optimized build. -DNDEBUG=1 makes the benchmark label its output "RELEASE".
malloc-release: MallocBenchmark.cpp
	clang -O3 -DNDEBUG=1 -o malloc-release MallocBenchmark.cpp
/*
* Simple malloc timing program -- how much overhead is there in malloc/free?
* Looks like MSVC is about 6x slower in debug and about 3.5x slower in release,
* compared to clang on Linux (note that the two measurements below were taken
* on different machines). This is when doing matched mallocs and frees.
*/
// AMD Ryzen Threadripper 1950x at 3.6 GHz
// Windows 10 Pro
// Visual Studio 2019, x64, debug and release:
// WIN32 DEBUG: 100000000 iterations, 39.072507 seconds: 390.725 ns/iter
// WIN32 RELEASE: 100000000 iterations, 19.773716 seconds: 197.737 ns/iter
//
// model name : Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
// cpu MHz : 3501.426
// Linux ip-172-31-31-60 5.13.0-1031-aws #35~20.04.1-Ubuntu SMP Mon Jun 13 22:30:30 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
// Ubuntu clang version 13.0.1
//
// POSIX DEBUG: 100000000 iterations, 6.537821 seconds: 65.378 ns/iter
// POSIX RELEASE: 100000000 iterations, 5.667803 seconds: 56.678 ns/iter
#if defined(_WIN32)
#include <Windows.h>
#else
#include <unistd.h>
#include <time.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#if defined(_WIN32)
// Reciprocal of the QueryPerformanceFrequency value. Stays 0.0 until
// init_clock() has run; read_clock() multiplies raw counter readings by it.
double clockMultiplier = 0.0;
// Platform label printed at the start of the result line.
char const platform[] = "WIN32";
void init_clock() {
int64_t freq = 0;
QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
clockMultiplier = 1.0 / double(freq);
}
double read_clock() {
int64_t ctr = 0;
QueryPerformanceCounter((LARGE_INTEGER*)&ctr);
return double(ctr) * clockMultiplier;
}
#else
// Platform label printed at the start of the result line.
char const platform[] = "POSIX";
// Intentionally empty: the read_clock() on this branch needs no prior setup.
// The function exists so main() can call init_clock() on every platform.
void init_clock() {
}
/*
 * Return the current CLOCK_MONOTONIC reading as seconds in a double
 * (whole seconds plus the nanosecond part scaled by 1e-9).
 *
 * If clock_gettime() fails, the error is reported on stderr and 0.0 is
 * returned, instead of returning whatever an uninitialized timespec held.
 */
double read_clock() {
    struct timespec ts = { 0, 0 };
    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
        perror("clock_gettime");
        return 0.0;
    }
    return (double)ts.tv_sec + 1e-9 * (double)ts.tv_nsec;
}
#endif
// Number of blocks the benchmark allocates and then frees; also the divisor
// used for the reported ns/iter figure.
constexpr int numIterations = 100 * 1000 * 1000;
// One slot per allocation: main() stores each malloc() result here and later
// frees the slots in increasing index order.
void* pointers[numIterations];
// Build-configuration label printed in the result line: "RELEASE" when NDEBUG
// is defined at compile time, "DEBUG" otherwise. Declared const, matching
// `platform`, because the program only ever reads it.
#if defined(NDEBUG)
char const config[] = "RELEASE";
#else
char const config[] = "DEBUG";
#endif
/*
 * Benchmark driver.
 *
 * Runs a pseudo-random interleaving of malloc() and free() calls until
 * numIterations blocks have been allocated and freed, then prints the
 * platform, build configuration, iteration count, elapsed seconds and
 * nanoseconds per iteration.
 *
 * Returns 0 on success, or 1 if malloc() fails part-way through (in which
 * case no timing is printed, since it would not describe real allocations).
 */
int main()
{
    init_clock();

    // Simple linear congruential random number generator, to make the
    // sequence exactly the same across platforms. The quality of this RNG
    // doesn't matter for what we're measuring.
    uint32_t rng = 15485863;
    int allocFront = 0;   // next slot of pointers[] to fill
    int freeFront = 0;    // oldest slot of pointers[] not yet freed

    double startTime = read_clock();
    while (freeFront < numIterations) {
        // some prime factors go into this RNG -- again, quality is good enough
        rng = rng * 40003409 + 100003313;

        // Bits 14-15 choose the action: three of their four values pick
        // "allocate", one picks "free", so until every block has been
        // allocated we allocate about 3x as often as we free.
        if (rng & 0xc000) {
            if (allocFront < numIterations) {
                // Bits 9-10 choose the size: 32, 64, 96 or 128 bytes.
                void* block = malloc(32UL + ((rng & 0x600) >> 4));
                if (block == nullptr) {
                    // Most blocks are live at once at the peak, so running
                    // out of memory is possible. Stop instead of timing
                    // allocations that did not happen.
                    fprintf(stderr, "%s %s: malloc failed after %d allocations (%d freed)\n",
                            platform, config, allocFront, freeFront);
                    return 1;
                }
                pointers[allocFront] = block;
                allocFront++;
            }
        }
        else if (freeFront < allocFront) {
            // Free in the same order the blocks were allocated.
            free(pointers[freeFront]);
            freeFront++;
        }
    }
    double endTime = read_clock();

    double elapsed = endTime - startTime;
    printf("%s %s: %d iterations, %f seconds: %.3f ns/iter\n", platform, config, numIterations, elapsed, 1e9 * elapsed / double(numIterations));
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment