Last active
January 17, 2024 17:50
-
-
Save bonzini/9a95b0e02d1ceb60af9e to your computer and use it in GitHub Desktop.
Optimizing "is this memory block equal to zero?"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <string.h> | |
#include <stdbool.h> | |
#define N 100000 | |
char buf1[1024], buf2[1024], buf3[1024], buf4[65536]; | |
static __attribute__((noinline)) bool memeqzero1(const void *data, size_t length) | |
{ | |
const unsigned char *p = data; | |
while (length) { | |
if (*p) | |
return false; | |
p++; | |
length--; | |
} | |
return true; | |
} | |
static __attribute__((noinline)) bool memeqzero2(const void *data, size_t length) | |
{ | |
const unsigned char *p = data; | |
static unsigned long zeroes[16]; | |
while (length > sizeof(zeroes)) { | |
if (memcmp(zeroes, p, sizeof(zeroes))) | |
return false; | |
p += sizeof(zeroes); | |
length -= sizeof(zeroes); | |
} | |
return memcmp(zeroes, p, length) == 0; | |
} | |
static __attribute__((noinline)) bool memeqzero3_rusty(const void *data, size_t length) | |
{ | |
const unsigned char *p = data; | |
const unsigned long zero = 0; | |
size_t pre; | |
pre = (size_t)p % sizeof(unsigned long); | |
if (pre) { | |
size_t n = sizeof(unsigned long) - pre; | |
if (n > length) | |
n = length; | |
if (memcmp(p, &zero, n) != 0) | |
return false; | |
p += n; | |
length -= n; | |
} | |
while (length > sizeof(zero)) { | |
if (*(unsigned long *)p != zero) | |
return false; | |
p += sizeof(zero); | |
length -= sizeof(zero); | |
} | |
return memcmp(&zero, p, length) == 0; | |
} | |
static __attribute__((noinline)) bool memeqzero3_paolo(const void *data, size_t length) | |
{ | |
const unsigned char *p = data; | |
unsigned long word; | |
while (length & (sizeof(word) - 1)) { | |
if (*p) | |
return false; | |
p++; | |
length--; | |
} | |
while (length) { | |
memcpy(&word, p, sizeof(word)); | |
if (word) | |
return false; | |
p += sizeof(word); | |
length -= sizeof(word); | |
} | |
return true; | |
} | |
static __attribute__((noinline)) bool memeqzero4_rusty(const void *data, size_t length) | |
{ | |
const unsigned char *p = data; | |
size_t len; | |
/* Check first 16 bytes manually */ | |
for (len = 0; len < 16; len++) { | |
if (!length) | |
return true; | |
if (*p) | |
return false; | |
p++; | |
length--; | |
} | |
/* Now we know that's zero, memcmp with self. */ | |
return memcmp(data, p, length) == 0; | |
} | |
static __attribute__((noinline)) bool memeqzero4_paolo(const void *data, size_t length) | |
{ | |
const unsigned char *p = data; | |
unsigned long word; | |
if (!length) | |
return true; | |
while (__builtin_expect(length & (sizeof(word) - 1), 0)) { | |
if (*p) | |
return false; | |
p++; | |
length--; | |
if (!length) | |
return true; | |
} | |
/* We must always read one byte or word, even if everything is aligned! | |
* Otherwise, memcmp(data, data, length) is trivially true. | |
*/ | |
for (;;) { | |
memcpy(&word, p, sizeof(word)); | |
if (word) | |
return false; | |
if (__builtin_expect(length & (16 - sizeof(word)), 0) == 0) | |
break; | |
p += sizeof(word); | |
length -= sizeof(word); | |
if (!length) | |
return true; | |
} | |
/* Now we know that's zero, memcmp with self. */ | |
return memcmp(data, p, length) == 0; | |
} | |
static inline unsigned long rdtsc() { | |
unsigned long cycles; | |
asm volatile("rdtsc; shlq $32, %%rdx; movl %%eax, %%eax; orq %%rdx, %%rax " : "=A"(cycles)); | |
return cycles; | |
} | |
static int bench(char *buf, int size, bool(*memeqzero)(const void *, size_t)) | |
{ | |
int i = N; | |
int count; | |
unsigned long start = rdtsc(); | |
while(i--) asm volatile("" : : "r" (memeqzero(buf, size))); | |
unsigned long end = rdtsc(); | |
i = count = 3000000000.0 * N / (end - start); | |
start = rdtsc(); | |
while(i--) asm volatile("" : : "r" (memeqzero(buf, size))); | |
end = rdtsc(); | |
return (end - start) / count; | |
} | |
static __attribute__((__flatten__)) | |
int run(bool(*memeqzero)(const void *, size_t)) | |
{ | |
printf ("%d\t%d\t%d\t%d\n", | |
bench(buf1, 1, memeqzero), bench(buf2, 8, memeqzero), | |
bench(buf3, 512, memeqzero), bench(buf4, 65536, memeqzero)); | |
} | |
int main() | |
{ | |
run(memeqzero1); | |
run(memeqzero2); | |
run(memeqzero3_rusty); | |
run(memeqzero3_paolo); | |
run(memeqzero4_rusty); | |
run(memeqzero4_paolo); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
And what the result ?