Last active
June 30, 2021 16:55
-
-
Save bwedding/d97c6b1c61e28aca9973817b42a3e97f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <chrono> | |
#include <algorithm> | |
#include <functional> | |
#include <iostream> | |
#include <random> | |
#include <string> | |
// All this should be in a header, but I'm doing this for ease of use in a Gist | |
// Readability | |
using StrlenFunc = size_t(*const)(const char*); | |
using TimePointRef = std::chrono::time_point<std::chrono::steady_clock> &; | |
using TimePoint = std::chrono::high_resolution_clock::time_point; | |
using DurationDblRef = std::chrono::duration<double> &; | |
using DurationDbl = std::chrono::duration<double>; | |
// Prototypes | |
void GetTotalTime(DurationDblRef totalTime, TimePointRef startTime); | |
void TestStrlenFuncs(int loops, int len, std::string &bigString, StrlenFunc, char *); | |
std::string randString(int len); | |
void PrintResults(char *funcName, DurationDblRef totalTime); | |
// Functions to ttest | |
size_t strlenopt(const char *str); | |
size_t RickMeyerStrLen(const char *str); | |
size_t strlenHandrolled(const char *str); | |
size_t asmstrlen(const char *str); | |
size_t strchrAsStrlen(const char *str); | |
// Constants | |
const int MaxString = 0xfffff0; | |
const int LongStrLen = 0xfffff; | |
const int ShortStrLen = 0xff; | |
const int LongStrLoops = 5000; | |
const int ShortStrLoops = 100000; | |
int main() | |
{ | |
// Generate 1 random string to use for all loops | |
// but will be indexed into at various parts for | |
// variability | |
std::cout << "Creating the big random string" << std::endl; | |
std::string bigString = randString(MaxString); | |
// Adjust loops and string len for your system | |
std::cout << "Long strings of len " << LongStrLen << std::endl; | |
TestStrlenFuncs(LongStrLoops, LongStrLen, bigString, asmstrlen, "asmstrlen"); | |
TestStrlenFuncs(LongStrLoops, LongStrLen, bigString, strchrAsStrlen, "strchrAsStrlen"); | |
TestStrlenFuncs(LongStrLoops, LongStrLen, bigString, std::strlen, "std::strlen"); | |
TestStrlenFuncs(LongStrLoops, LongStrLen, bigString, strlenopt, "GLib strlen"); | |
TestStrlenFuncs(LongStrLoops, LongStrLen, bigString, strlenHandrolled, "Handrolled"); | |
TestStrlenFuncs(LongStrLoops, LongStrLen, bigString, RickMeyerStrLen, "RMeyerStrLen"); | |
// Test short strings | |
std::cout << std::endl << "Short strings of len " << ShortStrLen << std::endl; | |
TestStrlenFuncs(ShortStrLoops, ShortStrLen, bigString, asmstrlen, "asmstrlen"); | |
TestStrlenFuncs(ShortStrLoops, ShortStrLen, bigString, strchrAsStrlen, "strchrAsStrlen"); | |
TestStrlenFuncs(ShortStrLoops, ShortStrLen, bigString, std::strlen, "std::strlen"); | |
TestStrlenFuncs(ShortStrLoops, ShortStrLen, bigString, strlenopt, "GLib strlen"); | |
TestStrlenFuncs(ShortStrLoops, ShortStrLen, bigString, strlenHandrolled, "Handrolled"); | |
TestStrlenFuncs(ShortStrLoops, ShortStrLen, bigString, RickMeyerStrLen, "RMeyerStrLen"); | |
return 0; | |
} | |
// Return time in seconds as a double. | |
static TimePoint GetTime(void) | |
{ | |
return std::chrono::high_resolution_clock::now(); | |
} | |
/* Return the length of the null-terminated string STR. Scan for | |
the null terminator quickly by testing four bytes at a time. */ | |
size_t strlenopt(const char *str) | |
{ | |
const char *char_ptr; | |
const unsigned long int *longword_ptr; | |
unsigned long int longword, himagic, lomagic; | |
/* Handle the first few characters by reading one character at a time. | |
Do this until CHAR_PTR is aligned on a longword boundary. */ | |
for (char_ptr = str; ((unsigned long int) char_ptr | |
& (sizeof(longword) - 1)) != 0; | |
++char_ptr) | |
if (*char_ptr == '\0') | |
return char_ptr - str; | |
/* All these elucidatory comments refer to 4-byte longwords, | |
but the theory applies equally well to 8-byte longwords. */ | |
longword_ptr = (unsigned long int *) char_ptr; | |
/* Bits 31, 24, 16, and 8 of this number are zero. Call these bits | |
the "holes." Note that there is a hole just to the left of | |
each byte, with an extra at the end: | |
bits: 01111110 11111110 11111110 11111111 | |
bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD | |
The 1-bits make sure that carries propagate to the next 0-bit. | |
The 0-bits provide holes for carries to fall into. */ | |
himagic = 0x80808080L; | |
lomagic = 0x01010101L; | |
if (sizeof(longword) > 4) | |
{ | |
/* 64-bit version of the magic. */ | |
/* Do the shift in two steps to avoid a warning if long has 32 bits. */ | |
himagic = ((himagic << 16) << 16) | himagic; | |
lomagic = ((lomagic << 16) << 16) | lomagic; | |
} | |
if (sizeof(longword) > 8) | |
abort(); | |
/* Instead of the traditional loop which tests each character, | |
we will test a longword at a time. The tricky part is testing | |
if *any of the four* bytes in the longword in question are zero. */ | |
for (;;) | |
{ | |
longword = *longword_ptr++; | |
if (((longword - lomagic) & ~longword & himagic) != 0) | |
{ | |
/* Which of the bytes was the zero? If none of them were, it was | |
a misfire; continue the search. */ | |
const char *cp = (const char *)(longword_ptr - 1); | |
if (cp[0] == 0) | |
return cp - str; | |
if (cp[1] == 0) | |
return cp - str + 1; | |
if (cp[2] == 0) | |
return cp - str + 2; | |
if (cp[3] == 0) | |
return cp - str + 3; | |
if (sizeof(longword) > 4) | |
{ | |
if (cp[4] == 0) | |
return cp - str + 4; | |
if (cp[5] == 0) | |
return cp - str + 5; | |
if (cp[6] == 0) | |
return cp - str + 6; | |
if (cp[7] == 0) | |
return cp - str + 7; | |
} | |
} | |
} | |
} | |
size_t strlenHandrolled(const char *str) | |
{ | |
const char *p; | |
if (str == NULL) | |
return 0; | |
p = str; | |
while (*p != '\0') | |
p++; | |
return p - str; | |
} | |
size_t strchrAsStrlen(const char *str) | |
{ | |
const char *p = strchr(str, '\0'); | |
return p - str; | |
} | |
size_t RickMeyerStrLen(const char* p) | |
{ | |
auto q = p; | |
while (*q) | |
q++; | |
return q - p; | |
} | |
std::string randString(int len) | |
{ | |
const char*deedle = "|/-\\"; | |
std::mt19937_64 gen{ std::random_device{}() }; | |
std::uniform_int_distribution<short> dist{ 'a', 'z' }; | |
const unsigned length = len + dist(gen); | |
std::string str(length, '\0'); | |
int i = 0, j = 0; | |
for (auto& c : str) | |
{ | |
c = dist(gen); | |
// Let the user know we're working here | |
if (j++ > 1000) | |
{ | |
std::cout << deedle[i++ % strlen(deedle)] << '\b'; | |
j = 0; | |
} | |
} | |
return str; | |
} | |
void TestStrlenFuncs(int loops, int len, std::string &bigString, StrlenFunc func, char* printStr) | |
{ | |
// Don't blow up | |
_ASSERT(len + 1000 < MaxString); | |
std::string t; | |
std::random_device rd; // Will be used to obtain a seed for the random number engine | |
std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd() | |
std::uniform_int_distribution<> dis(1, 1000); | |
DurationDbl totalTime; | |
volatile size_t res; // volatile just to ensure there's no optimization | |
auto startTime = GetTime(); | |
// So what we're doing here is getting a random number between 1 and 1000 | |
// to use as an offset into the big random string we created earlier. | |
// Then we'll get a sub string from that offset up to the len specified | |
// by the caller. We'll call the function to benchmark with this substring. | |
// This way, every function call is getting a different string and no optimization | |
// can affect our results. | |
// Creating these substrings ads to the overall benchmark time, but every function | |
// uses the same code so the overhead shouldn't matter in the test results. | |
for (int j = 0; j < loops; j++) | |
{ | |
int offset = dis(gen); // Get a random number to use as an index into bigString | |
t = bigString.substr(offset, len); | |
res = func(t.c_str()); | |
} | |
GetTotalTime(totalTime, startTime); | |
PrintResults(printStr, totalTime); | |
} | |
void GetTotalTime(DurationDblRef totalTime, TimePointRef startTime) | |
{ | |
totalTime = std::chrono::duration_cast<DurationDbl>(GetTime() - startTime); | |
} | |
void PrintResults(char *funcName, DurationDblRef totalTime) | |
{ | |
std::cout << funcName << ":\t\t\t" << totalTime.count() << " Seconds\n"; | |
} | |
#ifndef WORDS_BIGENDIAN | |
#if 0 | |
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes | |
{ | |
register int i = 0; | |
if (!(x & (1 << 0))) i++; | |
else return i; | |
if (!(x & (1 << 1))) i++; | |
else return i; | |
if (!(x & (1 << 2))) i++; | |
else return i; | |
if (!(x & (1 << 3))) i++; | |
else return i; | |
if (!(x & (1 << 4))) i++; | |
else return i; | |
if (!(x & (1 << 5))) i++; | |
else return i; | |
if (!(x & (1 << 6))) i++; | |
else return i; | |
if (!(x & (1 << 7))) i++; | |
else return i; | |
if (!(x & (1 << 8))) i++; | |
else return i; | |
if (!(x & (1 << 9))) i++; | |
else return i; | |
if (!(x & (1 << 10))) i++; | |
else return i; | |
if (!(x & (1 << 11))) i++; | |
else return i; | |
if (!(x & (1 << 12))) i++; | |
else return i; | |
if (!(x & (1 << 13))) i++; | |
else return i; | |
if (!(x & (1 << 14))) i++; | |
else return i; | |
if (!(x & (1 << 15))) i++; | |
return i; | |
} | |
#elif 0 | |
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes | |
{ | |
// http://www.hackersdelight.org/: ntz3() shortened for 16-bit mask by Peter Kankowski | |
register int n = 1; | |
if ((x & 0x000000FFU) == 0) { n += 8; x >>= 8; } | |
if ((x & 0x0000000FU) == 0) { n += 4; x >>= 4; } | |
if ((x & 0x00000003U) == 0) { n += 2; x >>= 2; } | |
return n - (x & 1); | |
} | |
#else | |
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes, by Nazo, post: 2009/07/20 03:40 | |
{ // this is current winner for speed | |
static const unsigned char table[256] = | |
{ | |
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, | |
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, | |
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, | |
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, | |
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, | |
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, | |
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, | |
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, | |
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, | |
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, | |
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, | |
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, | |
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, | |
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, | |
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, | |
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, | |
}; | |
if ((unsigned char)x) | |
return table[(unsigned char)x]; | |
return table[x >> 8] + 8; // t[x / 256] + 8 | |
} | |
#endif | |
#else | |
#if 0 | |
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes | |
{ | |
register int i = 0; | |
if (!(x & (1 << 15))) i++; | |
else return i; | |
if (!(x & (1 << 14))) i++; | |
else return i; | |
if (!(x & (1 << 13))) i++; | |
else return i; | |
if (!(x & (1 << 12))) i++; | |
else return i; | |
if (!(x & (1 << 11))) i++; | |
else return i; | |
if (!(x & (1 << 10))) i++; | |
else return i; | |
if (!(x & (1 << 9))) i++; | |
else return i; | |
if (!(x & (1 << 8))) i++; | |
else return i; | |
if (!(x & (1 << 7))) i++; | |
else return i; | |
if (!(x & (1 << 6))) i++; | |
else return i; | |
if (!(x & (1 << 5))) i++; | |
else return i; | |
if (!(x & (1 << 4))) i++; | |
else return i; | |
if (!(x & (1 << 3))) i++; | |
else return i; | |
if (!(x & (1 << 2))) i++; | |
else return i; | |
if (!(x & (1 << 1))) i++; | |
else return i; | |
if (!(x & (1 << 0))) i++; | |
return i; | |
} | |
#else | |
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes | |
{ | |
// http://www.hackersdelight.org/: nlz1() shortened for 16-bit mask | |
register int n = 0; | |
if (x <= 0x000000FFU) { n = n + 8; x = x << 8; } | |
if (x <= 0x00000FFFU) { n = n + 4; x = x << 4; } | |
if (x <= 0x00003FFFU) { n = n + 2; x = x << 2; } | |
if (x <= 0x00007FFFU) { n = n + 1; } | |
return n; | |
} | |
#endif | |
#endif | |
size_t asmstrlen(const char *str) | |
{ | |
register size_t len = 0; | |
// align to 16 bytes | |
while ((((intptr_t)str) & (sizeof(__m128i) - 1)) != 0) | |
{ | |
if (*str++ == 0) | |
return len; | |
++len; | |
} | |
// search for 0 | |
__m128i xmm0 = _mm_setzero_si128(); | |
__m128i xmm1; | |
int mask = 0; | |
for (;;) | |
{ | |
xmm1 = _mm_load_si128((__m128i *)str); | |
xmm1 = _mm_cmpeq_epi8(xmm1, xmm0); | |
if ((mask = _mm_movemask_epi8(xmm1)) != 0) | |
{ | |
// got 0 somewhere within 16 bytes in xmm1, or within 16 bits in mask | |
// find index of first set bit | |
#ifndef _DISABLE_ASM_BSF // define it to disable ASM | |
#if (_MSC_VER >= 1300) // make sure <intrin.h> is included | |
unsigned long pos; | |
_BitScanForward(&pos, mask); | |
len += (size_t)pos; | |
#elif defined(_MSC_VER) // earlier MSVC's do not have _BitScanForward, use inline asm | |
__asm bsf edx, mask; edx = bsf(mask) | |
__asm add edx, len; edx += len | |
__asm mov len, edx; len = edx | |
#elif ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))) // modern GCC has built-in __builtin_ctz | |
len += __builtin_ctz(mask); | |
#elif defined(__GNUC__) // older GCC shall use inline asm | |
unsigned int pos; | |
asm("bsf %1, %0" : "=r" (pos) : "rm" (mask)); | |
len += (size_t)pos; | |
#else // none of choices exist, use local BSF implementation | |
len += count_bits_to_0(mask); | |
#endif | |
#else | |
len += count_bits_to_0(mask); | |
#endif | |
break; | |
} | |
str += sizeof(__m128i); | |
len += sizeof(__m128i); | |
} | |
return len; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment