Last active
January 13, 2019 11:43
-
-
Save hardbyte/831dc10e1026873a53285f5049d75806 to your computer and use it in GitHub Desktop.
Making sure Julia can count bits.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <memory> | |
#include <functional> | |
#include <algorithm> | |
#include <vector> | |
#include <queue> | |
#include <stdint.h> | |
#include <cstdlib> | |
#include <ctime> | |
#include <cassert> | |
#include <climits> | |
static constexpr int WORD_BYTES = sizeof(uint64_t); | |
/** | |
* The popcount of n elements of buf is the sum of c0, c1, c2, c3. | |
*/ | |
template<int n> | |
static inline void | |
popcount( | |
uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, | |
const uint64_t *buf) { | |
popcount<4>(c0, c1, c2, c3, buf); | |
popcount<n - 4>(c0, c1, c2, c3, buf + 4); | |
} | |
// Fast Path | |
// | |
// Source: http://danluu.com/assembly-intrinsics/ | |
// https://stackoverflow.com/questions/25078285/replacing-a-32-bit-loop-count-variable-with-64-bit-introduces-crazy-performance | |
// | |
// NB: Dan Luu's original assembly (and the SO answer it was based on) | |
// is incorrect because it clobbers registers marked as "input only" | |
// (see warning at | |
// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#InputOperands | |
// -- this mistake does not materialise with GCC (4.9), but it does | |
// with Clang (3.6 and 3.8)). We fix the mistake by explicitly | |
// loading the contents of buf into registers and using these same | |
// registers for the intermediate popcnts. | |
/** | |
* The popcount of 4 elements of buf is the sum of c0, c1, c2, c3. | |
*/ | |
template<> | |
inline void | |
popcount<4>( | |
uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, | |
const uint64_t *buf) { | |
uint64_t b0, b1, b2, b3; | |
b0 = buf[0]; b1 = buf[1]; b2 = buf[2]; b3 = buf[3]; | |
__asm__( | |
"popcnt %4, %4 \n\t" | |
"add %4, %0 \n\t" | |
"popcnt %5, %5 \n\t" | |
"add %5, %1 \n\t" | |
"popcnt %6, %6 \n\t" | |
"add %6, %2 \n\t" | |
"popcnt %7, %7 \n\t" | |
"add %7, %3 \n\t" | |
: "+r" (c0), "+r" (c1), "+r" (c2), "+r" (c3), | |
"+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3)); | |
} | |
// Slow paths | |
// TODO: Assumes sizeof(long) == 8 | |
// | |
// NB: The specialisation to n=3 is not currently used but included | |
// for completeness (i.e. so that popcount<n> is defined for all | |
// non-negative n) and in anticipation of its use in the near future. | |
#if 0 | |
template<> | |
inline void | |
popcount<3>( | |
uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &, | |
const uint64_t *buf) { | |
c0 += __builtin_popcountl(buf[0]); | |
c1 += __builtin_popcountl(buf[1]); | |
c2 += __builtin_popcountl(buf[2]); | |
} | |
#endif | |
/** | |
* The popcount of 2 elements of buf is the sum of c0, c1. | |
*/ | |
template<> | |
inline void | |
popcount<2>( | |
uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &, | |
const uint64_t *buf) { | |
c0 += __builtin_popcountl(buf[0]); | |
c1 += __builtin_popcountl(buf[1]); | |
} | |
/** | |
* The popcount *buf is in c0. | |
*/ | |
template<> | |
inline void | |
popcount<1>( | |
uint64_t &c0, uint64_t &, uint64_t &, uint64_t &, | |
const uint64_t *buf) { | |
c0 += __builtin_popcountl(buf[0]); | |
} | |
/** | |
* Calculate population counts of an array of inputs of nwords elements. | |
* | |
* 'arrays' must point to narrays*nwords*WORD_BYTES bytes | |
* 'counts' must point to narrays*sizeof(uint32_t) bytes. | |
* For i = 0 to narrays - 1, the population count of the nwords elements | |
* | |
* arrays[i * nwords] ... arrays[(i + 1) * nwords - 1] | |
* | |
* is put in counts[i]. | |
*/ | |
template<int nwords> | |
static void | |
_popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) { | |
uint64_t c0, c1, c2, c3; | |
for (int i = 0; i < narrays; ++i, arrays += nwords) { | |
c0 = c1 = c2 = c3 = 0; | |
popcount<nwords>(c0, c1, c2, c3, arrays); | |
counts[i] = c0 + c1 + c2 + c3; | |
} | |
} | |
/** | |
* Return the popcount of the nwords elements starting at array. | |
*/ | |
static uint32_t | |
_popcount_array(const uint64_t *array, int nwords) { | |
uint64_t c0, c1, c2, c3; | |
c0 = c1 = c2 = c3 = 0; | |
while (nwords >= 16) { | |
popcount<16>(c0, c1, c2, c3, array); | |
array += 16; | |
nwords -= 16; | |
} | |
// nwords < 16 | |
if (nwords >= 8) { | |
popcount<8>(c0, c1, c2, c3, array); | |
array += 8; | |
nwords -= 8; | |
} | |
// nwords < 8 | |
if (nwords >= 4) { | |
popcount<4>(c0, c1, c2, c3, array); | |
array += 4; | |
nwords -= 4; | |
} | |
// nwords < 4 | |
if (nwords >= 2) { | |
popcount<2>(c0, c1, c2, c3, array); | |
array += 2; | |
nwords -= 2; | |
} | |
// nwords < 2 | |
if (nwords == 1) | |
popcount<1>(c0, c1, c2, c3, array); | |
return c0 + c1 + c2 + c3; | |
} | |
/** | |
* Convert clock measurement t to milliseconds. | |
* | |
* t should have been obtained as the difference of calls to clock() | |
* for this to make sense. | |
*/ | |
static inline double | |
to_millis(clock_t t) { | |
static constexpr double CPS = (double)CLOCKS_PER_SEC; | |
return t * 1.0E3 / CPS; | |
} | |
static inline uint32_t | |
abs_diff(uint32_t a, uint32_t b) { | |
if (a > b) | |
return a - b; | |
return b - a; | |
} | |
typedef const uint64_t word_tp; | |
typedef std::function<void (word_tp *)> deleter_fn; | |
typedef std::unique_ptr<word_tp, deleter_fn> word_ptr; | |
/** | |
* Given a char pointer ptr pointing to nbytes bytes, return a | |
* std::unique_ptr to uint64_t containing those same byte values | |
* (using the host's byte order, i.e. no byte reordering is done). | |
* | |
* The main purpose of this function is to fix pointer alignment | |
* issues when converting from a char pointer to a uint64 pointer; the | |
* issue being that a uint64 pointer has stricter alignment | |
* requirements than a char pointer. | |
* | |
* We assume that releasing the memory associated with ptr is someone | |
* else's problem. So, if ptr is already correctly aligned, we just | |
* cast it to uint64 and return a unique_ptr whose deleter is | |
* empty. If ptr is misaligned, then we copy the nbytes at ptr to a | |
* location that is correctly aligned and then return a unique_ptr | |
* whose deleter will delete that allocated space when the unique_ptr | |
* goes out of scope. | |
* | |
* NB: ASSUMES nbytes is divisible by WORD_BYTES. | |
* | |
* TODO: On some architectures it might be advantageous to have | |
* 16-byte aligned memory, even when the words used are only 8 bytes | |
* (this is to do with cache line size). This function could be easily | |
* modified to allow experimentation with different alignments. | |
*/ | |
static word_ptr | |
adjust_ptr_alignment(const char *ptr, size_t nbytes) { | |
static constexpr size_t wordbytes = sizeof(word_tp); | |
size_t nwords = nbytes / wordbytes; | |
uintptr_t addr = reinterpret_cast<uintptr_t>(ptr); | |
// ptr is correctly aligned if addr is divisible by wordbytes | |
if (addr % wordbytes != 0) { | |
// ptr is NOT correctly aligned | |
// copy_words has correct alignment | |
uint64_t *copy_words = new uint64_t[nwords]; | |
// alias copy_words with a byte pointer; the cast safe because | |
// uint8_t alignment is less restrictive than uint64_t | |
uint8_t *copy_bytes = reinterpret_cast<uint8_t *>(copy_words); | |
// copy everything across | |
std::copy(ptr, ptr + nbytes, copy_bytes); | |
// return a unique_ptr with array delete to delete copy_words | |
auto array_delete = [] (word_tp *p) { delete[] p; }; | |
return word_ptr(copy_words, array_delete); | |
} else { | |
// ptr IS correctly aligned | |
// safe cast because the address of ptr is wordbyte-aligned. | |
word_tp *same = reinterpret_cast<word_tp *>(ptr); | |
// don't delete backing array since we don't own it | |
auto empty_delete = [] (word_tp *) { }; | |
return word_ptr(same, empty_delete); | |
} | |
} | |
extern "C" | |
{ | |
/** | |
* Calculate population counts of an array of inputs; return how | |
* long it took in milliseconds. | |
* | |
* 'arrays' must point to narrays*array_bytes bytes | |
* 'counts' must point to narrays*sizeof(uint32_t) bytes. | |
* For i = 0 to n - 1, the population count of the array_bytes*8 bits | |
* | |
* arrays[i * array_bytes] ... arrays[(i + 1) * array_bytes - 1] | |
* | |
* is put in counts[i]. | |
* | |
* ASSUMES: array_bytes is divisible by 8. | |
*/ | |
double | |
popcount_arrays( | |
uint32_t *counts, | |
const char *arrays, int narrays, int array_bytes) { | |
// assumes WORD_BYTES divides array_bytes | |
int nwords = array_bytes / WORD_BYTES; | |
// The static_cast is to avoid int overflow in the multiplication | |
size_t total_bytes = static_cast<size_t>(narrays) * array_bytes; | |
auto ptr = adjust_ptr_alignment(arrays, total_bytes); | |
auto u = ptr.get(); | |
clock_t t = clock(); | |
switch (nwords) { | |
case 32: _popcount_arrays<32>(counts, u, narrays); break; | |
case 16: _popcount_arrays<16>(counts, u, narrays); break; | |
case 8: _popcount_arrays< 8>(counts, u, narrays); break; | |
default: | |
for (int i = 0; i < narrays; ++i, u += nwords) | |
counts[i] = _popcount_array(u, nwords); | |
} | |
return to_millis(clock() - t); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment