Skip to content

Instantly share code, notes, and snippets.

@madmann91
Last active December 9, 2020 14:49
Show Gist options
  • Save madmann91/a12d5c65bfb33573b0f8509d9042bec5 to your computer and use it in GitHub Desktop.
Save madmann91/a12d5c65bfb33573b0f8509d9042bec5 to your computer and use it in GitHub Desktop.
Benchmark for devilutionX's RenderLine function
// g++ bench_renderline.cpp -std=c++17 -O3 -march=native -o bench_renderline
// The benchmarked functions (RenderLine, ...) where taken from
// diasurgical/devilutionX
// Copyrights to their respective authors, license is "The Unlicense"
#include <cstdint>
#include <cstddef>
#include <cstring>
#include <climits>
#include <cassert>
#include <iostream>
#include <chrono>
#include <random>
#include <vector>
#include <algorithm>
#include <string_view>
// Uncomment to see the markers in the generated assembly
//#define ASM_MARKER(i) asm("nop ; ------ BENCHMARK ("#i") BEGIN --------");
#define ASM_MARKER(i)
static constexpr size_t TILE_HEIGHT = 32;
static constexpr size_t TILE_WIDTH = 64;
using DWORD = uint32_t;
using BYTE = uint8_t;
int light_table_index = 0;
char lightmax = 2;
enum {
RT_SQUARE,
RT_TRANSPARENT,
RT_LTRIANGLE,
RT_RTRIANGLE,
RT_LTRAPEZOID,
RT_RTRAPEZOID
};
/** Specifies the draw masks used to render transparency of the right side of tiles. */
static DWORD RightMask[TILE_HEIGHT] = {
0xEAAAAAAA, 0xF5555555,
0xFEAAAAAA, 0xFF555555,
0xFFEAAAAA, 0xFFF55555,
0xFFFEAAAA, 0xFFFF5555,
0xFFFFEAAA, 0xFFFFF555,
0xFFFFFEAA, 0xFFFFFF55,
0xFFFFFFEA, 0xFFFFFFF5,
0xFFFFFFFE, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF
};
/** Specifies the draw masks used to render transparency of the left side of tiles. */
static DWORD LeftMask[TILE_HEIGHT] = {
0xAAAAAAAB, 0x5555555F,
0xAAAAAABF, 0x555555FF,
0xAAAAABFF, 0x55555FFF,
0xAAAABFFF, 0x5555FFFF,
0xAAABFFFF, 0x555FFFFF,
0xAABFFFFF, 0x55FFFFFF,
0xABFFFFFF, 0x5FFFFFFF,
0xBFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF
};
/** Specifies the draw masks used to render transparency of wall tiles. */
static DWORD WallMask[TILE_HEIGHT] = {
0xAAAAAAAA, 0x55555555,
0xAAAAAAAA, 0x55555555,
0xAAAAAAAA, 0x55555555,
0xAAAAAAAA, 0x55555555,
0xAAAAAAAA, 0x55555555,
0xAAAAAAAA, 0x55555555,
0xAAAAAAAA, 0x55555555,
0xAAAAAAAA, 0x55555555,
0xAAAAAAAA, 0x55555555,
0xAAAAAAAA, 0x55555555,
0xAAAAAAAA, 0x55555555,
0xAAAAAAAA, 0x55555555,
0xAAAAAAAA, 0x55555555,
0xAAAAAAAA, 0x55555555,
0xAAAAAAAA, 0x55555555,
0xAAAAAAAA, 0x55555555
};
static DWORD SolidMask[TILE_HEIGHT] = {
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF
};
static DWORD RightFoliageMask[TILE_HEIGHT] = {
0xFFFFFFFF, 0x3FFFFFFF,
0x0FFFFFFF, 0x03FFFFFF,
0x00FFFFFF, 0x003FFFFF,
0x000FFFFF, 0x0003FFFF,
0x0000FFFF, 0x00003FFF,
0x00000FFF, 0x000003FF,
0x000000FF, 0x0000003F,
0x0000000F, 0x00000003,
0x00000000, 0x00000003,
0x0000000F, 0x0000003F,
0x000000FF, 0x000003FF,
0x00000FFF, 0x00003FFF,
0x0000FFFF, 0x0003FFFF,
0x000FFFFF, 0x003FFFFF,
0x00FFFFFF, 0x03FFFFFF,
0x0FFFFFFF, 0x3FFFFFFF,
};
static DWORD LeftFoliageMask[TILE_HEIGHT] = {
0xFFFFFFFF, 0xFFFFFFFC,
0xFFFFFFF0, 0xFFFFFFC0,
0xFFFFFF00, 0xFFFFFC00,
0xFFFFF000, 0xFFFFC000,
0xFFFF0000, 0xFFFC0000,
0xFFF00000, 0xFFC00000,
0xFF000000, 0xFC000000,
0xF0000000, 0xC0000000,
0x00000000, 0xC0000000,
0xF0000000, 0xFC000000,
0xFF000000, 0xFFC00000,
0xFFF00000, 0xFFFC0000,
0xFFFF0000, 0xFFFFC000,
0xFFFFF000, 0xFFFFFC00,
0xFFFFFF00, 0xFFFFFFC0,
0xFFFFFFF0, 0xFFFFFFFC,
};
inline static void OldRenderLine(BYTE **dst, BYTE **src, int n, BYTE *tbl, DWORD mask)
{
int i;
if (mask == 0xFFFFFFFF) {
if (light_table_index == lightmax) {
memset(*dst, 0, n);
(*src) += n;
(*dst) += n;
} else if (light_table_index == 0) {
memcpy(*dst, *src, n);
(*src) += n;
(*dst) += n;
} else {
for (i = 0; i < n; i++, (*src)++, (*dst)++) {
(*dst)[0] = tbl[(*src)[0]];
}
}
} else {
if (light_table_index == lightmax) {
(*src) += n;
for (i = 0; i < n; i++, (*dst)++, mask <<= 1) {
if (mask & 0x80000000) {
(*dst)[0] = 0;
}
}
} else if (light_table_index == 0) {
for (i = 0; i < n; i++, (*src)++, (*dst)++, mask <<= 1) {
if (mask & 0x80000000) {
(*dst)[0] = (*src)[0];
}
}
} else {
for (i = 0; i < n; i++, (*src)++, (*dst)++, mask <<= 1) {
if (mask & 0x80000000) {
(*dst)[0] = tbl[(*src)[0]];
}
}
}
}
}
template <bool UseBuiltin>
inline static int count_leading_zeros(DWORD mask) {
// Note: This function assumes that the argument is not zero,
// which means there is at least one bit set.
static_assert(sizeof(DWORD) == sizeof(uint32_t));
if constexpr (UseBuiltin) {
#if defined(__GNUC__) || defined(__clang__)
return __builtin_clz(mask);
#else
static_assert(false);
#endif
} else {
// Count the number of leading zeros using binary search.
int n = 0;
if ((mask & 0xFFFF0000) == 0) n += 16, mask <<= 16;
if ((mask & 0xFF000000) == 0) n += 8, mask <<= 8;
if ((mask & 0xF0000000) == 0) n += 4, mask <<= 4;
if ((mask & 0xC0000000) == 0) n += 2, mask <<= 2;
if ((mask & 0x80000000) == 0) n += 1;
return n;
}
}
template <bool UseBuiltin, typename F>
void foreach_set_bit(DWORD mask, const F& f) {
int i = 0;
while (mask != 0) {
int z = count_leading_zeros<UseBuiltin>(mask);
i += z, mask <<= z;
for (; mask & 0x80000000; i++, mask <<= 1)
f(i);
}
}
template <bool UseBuiltin, bool WithBug>
inline static void NewRenderLine(BYTE **dst, BYTE **src, int n, BYTE *tbl, DWORD mask) {
if (mask == 0xFFFFFFFF) {
if (light_table_index == lightmax) {
memset(*dst, 0, n);
} else if (light_table_index == 0) {
memcpy(*dst, *src, n);
} else {
for (int i = 0; i < n; i++) {
(*dst)[i] = tbl[(*src)[i]];
}
}
} else {
// The number of iterations is anyway limited by the size of the mask.
// So we can limit it by ANDing the mask with another mask that only keeps
// iterations that are lower than n. We can now avoid testing if i < n
// at every loop iteration.
assert(n != 0 && n <= sizeof(DWORD) * CHAR_BIT);
if constexpr (WithBug) {
mask &= ((((DWORD)1) << n) - 1) << ((sizeof(DWORD) * CHAR_BIT) - n);
} else {
mask &= DWORD(-1) << ((sizeof(DWORD) * CHAR_BIT) - n);
}
if (light_table_index == lightmax) {
foreach_set_bit<UseBuiltin>(mask, [=] (int i) { (*dst)[i] = 0; });
} else if (light_table_index == 0) {
foreach_set_bit<UseBuiltin>(mask, [=] (int i) { (*dst)[i] = (*src)[i]; });
} else {
foreach_set_bit<UseBuiltin>(mask, [=] (int i) { (*dst)[i] = tbl[(*src)[i]]; });
}
}
skip:
(*src) += n;
(*dst) += n;
}
DWORD choose_mask(size_t i) {
static const DWORD* masks[] = {
RightMask, LeftMask, WallMask, SolidMask, RightFoliageMask, LeftFoliageMask
};
return masks[i % 6][(i / 6) % TILE_HEIGHT];
}
template <typename F>
void profile(const std::string_view& msg, F&& f) {
static constexpr size_t M = 100;
std::vector<size_t> timings(M);
for (size_t i = 0; i < M; ++i) {
auto t_begin = std::chrono::high_resolution_clock::now();
f();
auto t_end = std::chrono::high_resolution_clock::now();
timings[i] = std::chrono::duration_cast<std::chrono::microseconds>(t_end - t_begin).count();
}
std::sort(timings.begin(), timings.end());
std::cout
<< msg
<< timings[M/4] << "/" << timings[M / 2] << "/" << timings[3 * M / 4]
<< "us (Q1/Q2(=med)/Q3)" << std::endl;
}
void init(BYTE* p, size_t n) {
std::random_device dev;
std::mt19937 gen;
std::uniform_int_distribution<> dist(0, 255);
for (size_t i = 0; i < n; ++i)
p[i] = dist(gen);
}
template <typename RenderLineFn>
void bench(BYTE* src, BYTE* dst, BYTE* tbl, RenderLineFn&& RenderLine) {
static constexpr size_t N = 100000;
// Test case 1: memset to 0
ASM_MARKER(1)
init(src, TILE_WIDTH);
init(dst, TILE_WIDTH);
light_table_index = lightmax;
for (size_t i = 0; i < N; i++) {
BYTE* psrc = i & 1 ? src : dst;
BYTE* pdst = i & 1 ? dst : src;
RenderLine(&psrc, &pdst, 1 | i%32, tbl, -1);
}
// Test case 2: memcpy
ASM_MARKER(2)
init(src, TILE_WIDTH);
init(dst, TILE_WIDTH);
light_table_index = 0;
for (size_t i = 0; i < N; i++) {
BYTE* psrc = i & 1 ? src : dst;
BYTE* pdst = i & 1 ? dst : src;
RenderLine(&psrc, &pdst, 1 | i%32, tbl, -1);
}
// Test case 3: gather
ASM_MARKER(3)
init(src, TILE_WIDTH);
init(dst, TILE_WIDTH);
light_table_index = 1;
for (size_t i = 0; i < N; i++) {
BYTE* psrc = i & 1 ? src : dst;
BYTE* pdst = i & 1 ? dst : src;
RenderLine(&psrc, &pdst, 1 | i%32, tbl, -1);
}
// Test case 4: masked memset
ASM_MARKER(4)
init(src, TILE_WIDTH);
init(dst, TILE_WIDTH);
light_table_index = lightmax;
for (size_t i = 0; i < N; i++) {
BYTE* psrc = i & 1 ? src : dst;
BYTE* pdst = i & 1 ? dst : src;
RenderLine(&psrc, &pdst, 1 | i%32, tbl, choose_mask(i));
}
// Test case 5: masked memcpy
ASM_MARKER(5)
init(src, TILE_WIDTH);
init(dst, TILE_WIDTH);
light_table_index = 0;
for (size_t i = 0; i < N; i++) {
BYTE* psrc = i & 1 ? src : dst;
BYTE* pdst = i & 1 ? dst : src;
RenderLine(&psrc, &pdst, 1 | i%32, tbl, choose_mask(i));
}
// Test case 6: masked gather
ASM_MARKER(6)
init(src, TILE_WIDTH);
init(dst, TILE_WIDTH);
// Test case 6: masked gather
light_table_index = 1;
for (size_t i = 0; i < N; i++) {
BYTE* psrc = i & 1 ? src : dst;
BYTE* pdst = i & 1 ? dst : src;
RenderLine(&psrc, &pdst, 1 | i%32, tbl, choose_mask(i));
}
}
int main() {
BYTE src[TILE_WIDTH];
BYTE dst[TILE_WIDTH];
BYTE tbl[256];
init(tbl, 256);
profile("Old version: ", [&] { bench(src, dst, tbl, OldRenderLine); });
profile("New version (fallback, with bug): ", [&] { bench(src, dst, tbl, NewRenderLine<false, true>); });
profile("New version (fallback, fixed): ", [&] { bench(src, dst, tbl, NewRenderLine<false, false>); });
profile("New version (builtin, with bug): ", [&] { bench(src, dst, tbl, NewRenderLine<true, true>); });
profile("New version (builtin, fixed): ", [&] { bench(src, dst, tbl, NewRenderLine<true, false>); });
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment