Last active
December 9, 2020 14:49
-
-
Save madmann91/a12d5c65bfb33573b0f8509d9042bec5 to your computer and use it in GitHub Desktop.
Benchmark for devilutionX's RenderLine function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// g++ bench_renderline.cpp -std=c++17 -O3 -march=native -o bench_renderline | |
// The benchmarked functions (RenderLine, ...) where taken from | |
// diasurgical/devilutionX | |
// Copyrights to their respective authors, license is "The Unlicense" | |
#include <cstdint> | |
#include <cstddef> | |
#include <cstring> | |
#include <climits> | |
#include <cassert> | |
#include <iostream> | |
#include <chrono> | |
#include <random> | |
#include <vector> | |
#include <algorithm> | |
#include <string_view> | |
// Uncomment to see the markers in the generated assembly | |
//#define ASM_MARKER(i) asm("nop ; ------ BENCHMARK ("#i") BEGIN --------"); | |
#define ASM_MARKER(i) | |
static constexpr size_t TILE_HEIGHT = 32; | |
static constexpr size_t TILE_WIDTH = 64; | |
using DWORD = uint32_t; | |
using BYTE = uint8_t; | |
int light_table_index = 0; | |
char lightmax = 2; | |
enum { | |
RT_SQUARE, | |
RT_TRANSPARENT, | |
RT_LTRIANGLE, | |
RT_RTRIANGLE, | |
RT_LTRAPEZOID, | |
RT_RTRAPEZOID | |
}; | |
/** Specifies the draw masks used to render transparency of the right side of tiles. */ | |
static DWORD RightMask[TILE_HEIGHT] = { | |
0xEAAAAAAA, 0xF5555555, | |
0xFEAAAAAA, 0xFF555555, | |
0xFFEAAAAA, 0xFFF55555, | |
0xFFFEAAAA, 0xFFFF5555, | |
0xFFFFEAAA, 0xFFFFF555, | |
0xFFFFFEAA, 0xFFFFFF55, | |
0xFFFFFFEA, 0xFFFFFFF5, | |
0xFFFFFFFE, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF | |
}; | |
/** Specifies the draw masks used to render transparency of the left side of tiles. */ | |
static DWORD LeftMask[TILE_HEIGHT] = { | |
0xAAAAAAAB, 0x5555555F, | |
0xAAAAAABF, 0x555555FF, | |
0xAAAAABFF, 0x55555FFF, | |
0xAAAABFFF, 0x5555FFFF, | |
0xAAABFFFF, 0x555FFFFF, | |
0xAABFFFFF, 0x55FFFFFF, | |
0xABFFFFFF, 0x5FFFFFFF, | |
0xBFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF | |
}; | |
/** Specifies the draw masks used to render transparency of wall tiles. */ | |
static DWORD WallMask[TILE_HEIGHT] = { | |
0xAAAAAAAA, 0x55555555, | |
0xAAAAAAAA, 0x55555555, | |
0xAAAAAAAA, 0x55555555, | |
0xAAAAAAAA, 0x55555555, | |
0xAAAAAAAA, 0x55555555, | |
0xAAAAAAAA, 0x55555555, | |
0xAAAAAAAA, 0x55555555, | |
0xAAAAAAAA, 0x55555555, | |
0xAAAAAAAA, 0x55555555, | |
0xAAAAAAAA, 0x55555555, | |
0xAAAAAAAA, 0x55555555, | |
0xAAAAAAAA, 0x55555555, | |
0xAAAAAAAA, 0x55555555, | |
0xAAAAAAAA, 0x55555555, | |
0xAAAAAAAA, 0x55555555, | |
0xAAAAAAAA, 0x55555555 | |
}; | |
static DWORD SolidMask[TILE_HEIGHT] = { | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF, | |
0xFFFFFFFF, 0xFFFFFFFF | |
}; | |
static DWORD RightFoliageMask[TILE_HEIGHT] = { | |
0xFFFFFFFF, 0x3FFFFFFF, | |
0x0FFFFFFF, 0x03FFFFFF, | |
0x00FFFFFF, 0x003FFFFF, | |
0x000FFFFF, 0x0003FFFF, | |
0x0000FFFF, 0x00003FFF, | |
0x00000FFF, 0x000003FF, | |
0x000000FF, 0x0000003F, | |
0x0000000F, 0x00000003, | |
0x00000000, 0x00000003, | |
0x0000000F, 0x0000003F, | |
0x000000FF, 0x000003FF, | |
0x00000FFF, 0x00003FFF, | |
0x0000FFFF, 0x0003FFFF, | |
0x000FFFFF, 0x003FFFFF, | |
0x00FFFFFF, 0x03FFFFFF, | |
0x0FFFFFFF, 0x3FFFFFFF, | |
}; | |
static DWORD LeftFoliageMask[TILE_HEIGHT] = { | |
0xFFFFFFFF, 0xFFFFFFFC, | |
0xFFFFFFF0, 0xFFFFFFC0, | |
0xFFFFFF00, 0xFFFFFC00, | |
0xFFFFF000, 0xFFFFC000, | |
0xFFFF0000, 0xFFFC0000, | |
0xFFF00000, 0xFFC00000, | |
0xFF000000, 0xFC000000, | |
0xF0000000, 0xC0000000, | |
0x00000000, 0xC0000000, | |
0xF0000000, 0xFC000000, | |
0xFF000000, 0xFFC00000, | |
0xFFF00000, 0xFFFC0000, | |
0xFFFF0000, 0xFFFFC000, | |
0xFFFFF000, 0xFFFFFC00, | |
0xFFFFFF00, 0xFFFFFFC0, | |
0xFFFFFFF0, 0xFFFFFFFC, | |
}; | |
inline static void OldRenderLine(BYTE **dst, BYTE **src, int n, BYTE *tbl, DWORD mask) | |
{ | |
int i; | |
if (mask == 0xFFFFFFFF) { | |
if (light_table_index == lightmax) { | |
memset(*dst, 0, n); | |
(*src) += n; | |
(*dst) += n; | |
} else if (light_table_index == 0) { | |
memcpy(*dst, *src, n); | |
(*src) += n; | |
(*dst) += n; | |
} else { | |
for (i = 0; i < n; i++, (*src)++, (*dst)++) { | |
(*dst)[0] = tbl[(*src)[0]]; | |
} | |
} | |
} else { | |
if (light_table_index == lightmax) { | |
(*src) += n; | |
for (i = 0; i < n; i++, (*dst)++, mask <<= 1) { | |
if (mask & 0x80000000) { | |
(*dst)[0] = 0; | |
} | |
} | |
} else if (light_table_index == 0) { | |
for (i = 0; i < n; i++, (*src)++, (*dst)++, mask <<= 1) { | |
if (mask & 0x80000000) { | |
(*dst)[0] = (*src)[0]; | |
} | |
} | |
} else { | |
for (i = 0; i < n; i++, (*src)++, (*dst)++, mask <<= 1) { | |
if (mask & 0x80000000) { | |
(*dst)[0] = tbl[(*src)[0]]; | |
} | |
} | |
} | |
} | |
} | |
template <bool UseBuiltin> | |
inline static int count_leading_zeros(DWORD mask) { | |
// Note: This function assumes that the argument is not zero, | |
// which means there is at least one bit set. | |
static_assert(sizeof(DWORD) == sizeof(uint32_t)); | |
if constexpr (UseBuiltin) { | |
#if defined(__GNUC__) || defined(__clang__) | |
return __builtin_clz(mask); | |
#else | |
static_assert(false); | |
#endif | |
} else { | |
// Count the number of leading zeros using binary search. | |
int n = 0; | |
if ((mask & 0xFFFF0000) == 0) n += 16, mask <<= 16; | |
if ((mask & 0xFF000000) == 0) n += 8, mask <<= 8; | |
if ((mask & 0xF0000000) == 0) n += 4, mask <<= 4; | |
if ((mask & 0xC0000000) == 0) n += 2, mask <<= 2; | |
if ((mask & 0x80000000) == 0) n += 1; | |
return n; | |
} | |
} | |
template <bool UseBuiltin, typename F> | |
void foreach_set_bit(DWORD mask, const F& f) { | |
int i = 0; | |
while (mask != 0) { | |
int z = count_leading_zeros<UseBuiltin>(mask); | |
i += z, mask <<= z; | |
for (; mask & 0x80000000; i++, mask <<= 1) | |
f(i); | |
} | |
} | |
template <bool UseBuiltin, bool WithBug> | |
inline static void NewRenderLine(BYTE **dst, BYTE **src, int n, BYTE *tbl, DWORD mask) { | |
if (mask == 0xFFFFFFFF) { | |
if (light_table_index == lightmax) { | |
memset(*dst, 0, n); | |
} else if (light_table_index == 0) { | |
memcpy(*dst, *src, n); | |
} else { | |
for (int i = 0; i < n; i++) { | |
(*dst)[i] = tbl[(*src)[i]]; | |
} | |
} | |
} else { | |
// The number of iterations is anyway limited by the size of the mask. | |
// So we can limit it by ANDing the mask with another mask that only keeps | |
// iterations that are lower than n. We can now avoid testing if i < n | |
// at every loop iteration. | |
assert(n != 0 && n <= sizeof(DWORD) * CHAR_BIT); | |
if constexpr (WithBug) { | |
mask &= ((((DWORD)1) << n) - 1) << ((sizeof(DWORD) * CHAR_BIT) - n); | |
} else { | |
mask &= DWORD(-1) << ((sizeof(DWORD) * CHAR_BIT) - n); | |
} | |
if (light_table_index == lightmax) { | |
foreach_set_bit<UseBuiltin>(mask, [=] (int i) { (*dst)[i] = 0; }); | |
} else if (light_table_index == 0) { | |
foreach_set_bit<UseBuiltin>(mask, [=] (int i) { (*dst)[i] = (*src)[i]; }); | |
} else { | |
foreach_set_bit<UseBuiltin>(mask, [=] (int i) { (*dst)[i] = tbl[(*src)[i]]; }); | |
} | |
} | |
skip: | |
(*src) += n; | |
(*dst) += n; | |
} | |
DWORD choose_mask(size_t i) { | |
static const DWORD* masks[] = { | |
RightMask, LeftMask, WallMask, SolidMask, RightFoliageMask, LeftFoliageMask | |
}; | |
return masks[i % 6][(i / 6) % TILE_HEIGHT]; | |
} | |
template <typename F> | |
void profile(const std::string_view& msg, F&& f) { | |
static constexpr size_t M = 100; | |
std::vector<size_t> timings(M); | |
for (size_t i = 0; i < M; ++i) { | |
auto t_begin = std::chrono::high_resolution_clock::now(); | |
f(); | |
auto t_end = std::chrono::high_resolution_clock::now(); | |
timings[i] = std::chrono::duration_cast<std::chrono::microseconds>(t_end - t_begin).count(); | |
} | |
std::sort(timings.begin(), timings.end()); | |
std::cout | |
<< msg | |
<< timings[M/4] << "/" << timings[M / 2] << "/" << timings[3 * M / 4] | |
<< "us (Q1/Q2(=med)/Q3)" << std::endl; | |
} | |
void init(BYTE* p, size_t n) { | |
std::random_device dev; | |
std::mt19937 gen; | |
std::uniform_int_distribution<> dist(0, 255); | |
for (size_t i = 0; i < n; ++i) | |
p[i] = dist(gen); | |
} | |
template <typename RenderLineFn> | |
void bench(BYTE* src, BYTE* dst, BYTE* tbl, RenderLineFn&& RenderLine) { | |
static constexpr size_t N = 100000; | |
// Test case 1: memset to 0 | |
ASM_MARKER(1) | |
init(src, TILE_WIDTH); | |
init(dst, TILE_WIDTH); | |
light_table_index = lightmax; | |
for (size_t i = 0; i < N; i++) { | |
BYTE* psrc = i & 1 ? src : dst; | |
BYTE* pdst = i & 1 ? dst : src; | |
RenderLine(&psrc, &pdst, 1 | i%32, tbl, -1); | |
} | |
// Test case 2: memcpy | |
ASM_MARKER(2) | |
init(src, TILE_WIDTH); | |
init(dst, TILE_WIDTH); | |
light_table_index = 0; | |
for (size_t i = 0; i < N; i++) { | |
BYTE* psrc = i & 1 ? src : dst; | |
BYTE* pdst = i & 1 ? dst : src; | |
RenderLine(&psrc, &pdst, 1 | i%32, tbl, -1); | |
} | |
// Test case 3: gather | |
ASM_MARKER(3) | |
init(src, TILE_WIDTH); | |
init(dst, TILE_WIDTH); | |
light_table_index = 1; | |
for (size_t i = 0; i < N; i++) { | |
BYTE* psrc = i & 1 ? src : dst; | |
BYTE* pdst = i & 1 ? dst : src; | |
RenderLine(&psrc, &pdst, 1 | i%32, tbl, -1); | |
} | |
// Test case 4: masked memset | |
ASM_MARKER(4) | |
init(src, TILE_WIDTH); | |
init(dst, TILE_WIDTH); | |
light_table_index = lightmax; | |
for (size_t i = 0; i < N; i++) { | |
BYTE* psrc = i & 1 ? src : dst; | |
BYTE* pdst = i & 1 ? dst : src; | |
RenderLine(&psrc, &pdst, 1 | i%32, tbl, choose_mask(i)); | |
} | |
// Test case 5: masked memcpy | |
ASM_MARKER(5) | |
init(src, TILE_WIDTH); | |
init(dst, TILE_WIDTH); | |
light_table_index = 0; | |
for (size_t i = 0; i < N; i++) { | |
BYTE* psrc = i & 1 ? src : dst; | |
BYTE* pdst = i & 1 ? dst : src; | |
RenderLine(&psrc, &pdst, 1 | i%32, tbl, choose_mask(i)); | |
} | |
// Test case 6: masked gather | |
ASM_MARKER(6) | |
init(src, TILE_WIDTH); | |
init(dst, TILE_WIDTH); | |
// Test case 6: masked gather | |
light_table_index = 1; | |
for (size_t i = 0; i < N; i++) { | |
BYTE* psrc = i & 1 ? src : dst; | |
BYTE* pdst = i & 1 ? dst : src; | |
RenderLine(&psrc, &pdst, 1 | i%32, tbl, choose_mask(i)); | |
} | |
} | |
int main() { | |
BYTE src[TILE_WIDTH]; | |
BYTE dst[TILE_WIDTH]; | |
BYTE tbl[256]; | |
init(tbl, 256); | |
profile("Old version: ", [&] { bench(src, dst, tbl, OldRenderLine); }); | |
profile("New version (fallback, with bug): ", [&] { bench(src, dst, tbl, NewRenderLine<false, true>); }); | |
profile("New version (fallback, fixed): ", [&] { bench(src, dst, tbl, NewRenderLine<false, false>); }); | |
profile("New version (builtin, with bug): ", [&] { bench(src, dst, tbl, NewRenderLine<true, true>); }); | |
profile("New version (builtin, fixed): ", [&] { bench(src, dst, tbl, NewRenderLine<true, false>); }); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment