hardbyte · January 13, 2019 11:43
diff --git a/_countingbits.so b/_countingbits.so
diff --git a/countingbits.cpp b/countingbits.cpp
 #include <memory>
 #include <functional>
 #include <algorithm>
 #include <vector>
 #include <queue>
 #include <stdint.h>
 #include <cstdlib>
 #include <ctime>
 #include <cassert>
 #include <climits>

 static constexpr int WORD_BYTES = sizeof(uint64_t);

 /**
 * The popcount of n elements of buf is the sum of c0, c1, c2, c3.
 */
 template<int n>
 static inline void
 popcount(
        uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
        const uint64_t *buf) {
    popcount<4>(c0, c1, c2, c3, buf);
    popcount<n - 4>(c0, c1, c2, c3, buf + 4);
 }

 // Fast Path
 //
 // Source: http://danluu.com/assembly-intrinsics/
 // https://stackoverflow.com/questions/25078285/replacing-a-32-bit-loop-count-variable-with-64-bit-introduces-crazy-performance
 //
 // NB: Dan Luu's original assembly (and the SO answer it was based on)
 // is incorrect because it clobbers registers marked as "input only"
 // (see warning at
 // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#InputOperands
 // -- this mistake does not materialise with GCC (4.9), but it does
 // with Clang (3.6 and 3.8)).  We fix the mistake by explicitly
 // loading the contents of buf into registers and using these same
 // registers for the intermediate popcnts.
 /**
 * The popcount of 4 elements of buf is the sum of c0, c1, c2, c3.
 */
 template<>
 inline void
 popcount<4>(
        uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
        const uint64_t *buf) {
    uint64_t b0, b1, b2, b3;
    b0 = buf[0]; b1 = buf[1]; b2 = buf[2]; b3 = buf[3];
    __asm__(
        "popcnt %4, %4  \n\t"
        "add %4, %0     \n\t"
        "popcnt %5, %5  \n\t"
        "add %5, %1     \n\t"
        "popcnt %6, %6  \n\t"
        "add %6, %2     \n\t"
        "popcnt %7, %7  \n\t"
        "add %7, %3     \n\t"
        : "+r" (c0), "+r" (c1), "+r" (c2), "+r" (c3),
          "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3));
 }

 // Slow paths
 // TODO: Assumes sizeof(long) == 8
 //
 // NB: The specialisation to n=3 is not currently used but included
 // for completeness (i.e. so that popcount<n> is defined for all
 // non-negative n) and in anticipation of its use in the near future.
 #if 0
 template<>
 inline void
 popcount<3>(
        uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &,
        const uint64_t *buf) {
    c0 += __builtin_popcountl(buf[0]);
    c1 += __builtin_popcountl(buf[1]);
    c2 += __builtin_popcountl(buf[2]);
 }
 #endif

 /**
 * The popcount of 2 elements of buf is the sum of c0, c1.
 */
 template<>
 inline void
 popcount<2>(
        uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &,
        const uint64_t *buf) {
    c0 += __builtin_popcountl(buf[0]);
    c1 += __builtin_popcountl(buf[1]);
 }

 /**
 * The popcount *buf is in c0.
 */
 template<>
 inline void
 popcount<1>(
        uint64_t &c0, uint64_t &, uint64_t &, uint64_t &,
        const uint64_t *buf) {
    c0 += __builtin_popcountl(buf[0]);
 }


 /**
 * Calculate population counts of an array of inputs of nwords elements.
 *
 * 'arrays' must point to narrays*nwords*WORD_BYTES bytes
 * 'counts' must point to narrays*sizeof(uint32_t) bytes.
 * For i = 0 to narrays - 1, the population count of the nwords elements
 *
 *   arrays[i * nwords] ... arrays[(i + 1) * nwords - 1]
 *
 * is put in counts[i].
 */
 template<int nwords>
 static void
 _popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) {
    uint64_t c0, c1, c2, c3;
    for (int i = 0; i < narrays; ++i, arrays += nwords) {
        c0 = c1 = c2 = c3 = 0;
        popcount<nwords>(c0, c1, c2, c3, arrays);
        counts[i] = c0 + c1 + c2 + c3;
    }
 }

 /**
 * Return the popcount of the nwords elements starting at array.
 */
 static uint32_t
 _popcount_array(const uint64_t *array, int nwords) {
    uint64_t c0, c1, c2, c3;
    c0 = c1 = c2 = c3 = 0;

    while (nwords >= 16) {
        popcount<16>(c0, c1, c2, c3, array);
        array += 16;
        nwords -= 16;
    }
    // nwords < 16
    if (nwords >= 8) {
        popcount<8>(c0, c1, c2, c3, array);
        array += 8;
        nwords -= 8;
    }
    // nwords < 8
    if (nwords >= 4) {
        popcount<4>(c0, c1, c2, c3, array);
        array += 4;
        nwords -= 4;
    }
    // nwords < 4
    if (nwords >= 2) {
        popcount<2>(c0, c1, c2, c3, array);
        array += 2;
        nwords -= 2;
    }
    // nwords < 2
    if (nwords == 1)
        popcount<1>(c0, c1, c2, c3, array);
    return c0 + c1 + c2 + c3;
 }

 /**
 * Convert clock measurement t to milliseconds.
 *
 * t should have been obtained as the difference of calls to clock()
 * for this to make sense.
 */
 static inline double
 to_millis(clock_t t) {
    static constexpr double CPS = (double)CLOCKS_PER_SEC;
    return t * 1.0E3 / CPS;
 }

 static inline uint32_t
 abs_diff(uint32_t a, uint32_t b) {
    if (a > b)
        return a - b;
    return b - a;
 }


 typedef const uint64_t word_tp;
 typedef std::function<void (word_tp *)> deleter_fn;
 typedef std::unique_ptr<word_tp, deleter_fn> word_ptr;

 /**
 * Given a char pointer ptr pointing to nbytes bytes, return a
 * std::unique_ptr to uint64_t containing those same byte values
 * (using the host's byte order, i.e. no byte reordering is done).
 *
 * The main purpose of this function is to fix pointer alignment
 * issues when converting from a char pointer to a uint64 pointer; the
 * issue being that a uint64 pointer has stricter alignment
 * requirements than a char pointer.
 *
 * We assume that releasing the memory associated with ptr is someone
 * else's problem. So, if ptr is already correctly aligned, we just
 * cast it to uint64 and return a unique_ptr whose deleter is
 * empty. If ptr is misaligned, then we copy the nbytes at ptr to a
 * location that is correctly aligned and then return a unique_ptr
 * whose deleter will delete that allocated space when the unique_ptr
 * goes out of scope.
 *
 * NB: ASSUMES nbytes is divisible by WORD_BYTES.
 *
 * TODO: On some architectures it might be advantageous to have
 * 16-byte aligned memory, even when the words used are only 8 bytes
 * (this is to do with cache line size). This function could be easily
 * modified to allow experimentation with different alignments.
 */
 static word_ptr
 adjust_ptr_alignment(const char *ptr, size_t nbytes) {
    static constexpr size_t wordbytes = sizeof(word_tp);
    size_t nwords = nbytes / wordbytes;
    uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
    // ptr is correctly aligned if addr is divisible by wordbytes
    if (addr % wordbytes != 0) {
        // ptr is NOT correctly aligned

        // copy_words has correct alignment
        uint64_t *copy_words = new uint64_t[nwords];
        // alias copy_words with a byte pointer; the cast safe because
        // uint8_t alignment is less restrictive than uint64_t
        uint8_t *copy_bytes = reinterpret_cast<uint8_t *>(copy_words);
        // copy everything across
        std::copy(ptr, ptr + nbytes, copy_bytes);
        // return a unique_ptr with array delete to delete copy_words
        auto array_delete = [] (word_tp *p) { delete[] p; };
        return word_ptr(copy_words, array_delete);
    } else {
        // ptr IS correctly aligned

        // safe cast because the address of ptr is wordbyte-aligned.
        word_tp *same = reinterpret_cast<word_tp *>(ptr);
        // don't delete backing array since we don't own it
        auto empty_delete = [] (word_tp *) { };
        return word_ptr(same, empty_delete);
    }
 }


 extern "C"
 {
    /**
     * Calculate population counts of an array of inputs; return how
     * long it took in milliseconds.
     *
     * 'arrays' must point to narrays*array_bytes bytes
     * 'counts' must point to narrays*sizeof(uint32_t) bytes.
     * For i = 0 to n - 1, the population count of the array_bytes*8 bits
     *
     *   arrays[i * array_bytes] ... arrays[(i + 1) * array_bytes - 1]
     *
     * is put in counts[i].
     *
     * ASSUMES: array_bytes is divisible by 8.
     */
    double
    popcount_arrays(
            uint32_t *counts,
            const char *arrays, int narrays, int array_bytes) {
        // assumes WORD_BYTES divides array_bytes
        int nwords = array_bytes / WORD_BYTES;
        // The static_cast is to avoid int overflow in the multiplication
        size_t total_bytes = static_cast<size_t>(narrays) * array_bytes;
        auto ptr = adjust_ptr_alignment(arrays, total_bytes);
        auto u = ptr.get();

        clock_t t = clock();
        switch (nwords) {
        case 32: _popcount_arrays<32>(counts, u, narrays); break;
        case 16: _popcount_arrays<16>(counts, u, narrays); break;
        case  8: _popcount_arrays< 8>(counts, u, narrays); break;
        default:
            for (int i = 0; i < narrays; ++i, u += nwords)
                counts[i] = _popcount_array(u, nwords);
        }
        return to_millis(clock() - t);
    }
 }
diff --git a/Julia Popcount.ipynb b/Julia Popcount.ipynb
	#include <memory>
	#include <functional>
	#include <algorithm>
	#include <vector>
	#include <queue>
	#include <stdint.h>
	#include <cstdlib>
	#include <ctime>
	#include <cassert>
	#include <climits>

	static constexpr int WORD_BYTES = sizeof(uint64_t);

	/**
	* The popcount of n elements of buf is the sum of c0, c1, c2, c3.
	*/
	template<int n>
	static inline void
	popcount(
	uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
	const uint64_t *buf) {
	popcount<4>(c0, c1, c2, c3, buf);
	popcount<n - 4>(c0, c1, c2, c3, buf + 4);
	}

	// Fast Path
	//
	// Source: http://danluu.com/assembly-intrinsics/
	// https://stackoverflow.com/questions/25078285/replacing-a-32-bit-loop-count-variable-with-64-bit-introduces-crazy-performance
	//
	// NB: Dan Luu's original assembly (and the SO answer it was based on)
	// is incorrect because it clobbers registers marked as "input only"
	// (see warning at
	// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#InputOperands
	// -- this mistake does not materialise with GCC (4.9), but it does
	// with Clang (3.6 and 3.8)). We fix the mistake by explicitly
	// loading the contents of buf into registers and using these same
	// registers for the intermediate popcnts.
	/**
	* The popcount of 4 elements of buf is the sum of c0, c1, c2, c3.
	*/
	template<>
	inline void
	popcount<4>(
	uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
	const uint64_t *buf) {
	uint64_t b0, b1, b2, b3;
	b0 = buf[0]; b1 = buf[1]; b2 = buf[2]; b3 = buf[3];
	__asm__(
	"popcnt %4, %4 \n\t"
	"add %4, %0 \n\t"
	"popcnt %5, %5 \n\t"
	"add %5, %1 \n\t"
	"popcnt %6, %6 \n\t"
	"add %6, %2 \n\t"
	"popcnt %7, %7 \n\t"
	"add %7, %3 \n\t"
	: "+r" (c0), "+r" (c1), "+r" (c2), "+r" (c3),
	"+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3));
	}

	// Slow paths
	// TODO: Assumes sizeof(long) == 8
	//
	// NB: The specialisation to n=3 is not currently used but included
	// for completeness (i.e. so that popcount<n> is defined for all
	// non-negative n) and in anticipation of its use in the near future.
	#if 0
	template<>
	inline void
	popcount<3>(
	uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &,
	const uint64_t *buf) {
	c0 += __builtin_popcountl(buf[0]);
	c1 += __builtin_popcountl(buf[1]);
	c2 += __builtin_popcountl(buf[2]);
	}
	#endif

	/**
	* The popcount of 2 elements of buf is the sum of c0, c1.
	*/
	template<>
	inline void
	popcount<2>(
	uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &,
	const uint64_t *buf) {
	c0 += __builtin_popcountl(buf[0]);
	c1 += __builtin_popcountl(buf[1]);
	}

	/**
	* The popcount *buf is in c0.
	*/
	template<>
	inline void
	popcount<1>(
	uint64_t &c0, uint64_t &, uint64_t &, uint64_t &,
	const uint64_t *buf) {
	c0 += __builtin_popcountl(buf[0]);
	}


	/**
	* Calculate population counts of an array of inputs of nwords elements.
	*
	* 'arrays' must point to narraysnwordsWORD_BYTES bytes
	* 'counts' must point to narrays*sizeof(uint32_t) bytes.
	* For i = 0 to narrays - 1, the population count of the nwords elements
	*
	* arrays[i * nwords] ... arrays[(i + 1) * nwords - 1]
	*
	* is put in counts[i].
	*/
	template<int nwords>
	static void
	_popcount_arrays(uint32_t counts, const uint64_t arrays, int narrays) {
	uint64_t c0, c1, c2, c3;
	for (int i = 0; i < narrays; ++i, arrays += nwords) {
	c0 = c1 = c2 = c3 = 0;
	popcount<nwords>(c0, c1, c2, c3, arrays);
	counts[i] = c0 + c1 + c2 + c3;
	}
	}

	/**
	* Return the popcount of the nwords elements starting at array.
	*/
	static uint32_t
	_popcount_array(const uint64_t *array, int nwords) {
	uint64_t c0, c1, c2, c3;
	c0 = c1 = c2 = c3 = 0;

	while (nwords >= 16) {
	popcount<16>(c0, c1, c2, c3, array);
	array += 16;
	nwords -= 16;
	}
	// nwords < 16
	if (nwords >= 8) {
	popcount<8>(c0, c1, c2, c3, array);
	array += 8;
	nwords -= 8;
	}
	// nwords < 8
	if (nwords >= 4) {
	popcount<4>(c0, c1, c2, c3, array);
	array += 4;
	nwords -= 4;
	}
	// nwords < 4
	if (nwords >= 2) {
	popcount<2>(c0, c1, c2, c3, array);
	array += 2;
	nwords -= 2;
	}
	// nwords < 2
	if (nwords == 1)
	popcount<1>(c0, c1, c2, c3, array);
	return c0 + c1 + c2 + c3;
	}

	/**
	* Convert clock measurement t to milliseconds.
	*
	* t should have been obtained as the difference of calls to clock()
	* for this to make sense.
	*/
	static inline double
	to_millis(clock_t t) {
	static constexpr double CPS = (double)CLOCKS_PER_SEC;
	return t * 1.0E3 / CPS;
	}

	static inline uint32_t
	abs_diff(uint32_t a, uint32_t b) {
	if (a > b)
	return a - b;
	return b - a;
	}


	typedef const uint64_t word_tp;
	typedef std::function<void (word_tp *)> deleter_fn;
	typedef std::unique_ptr<word_tp, deleter_fn> word_ptr;

	/**
	* Given a char pointer ptr pointing to nbytes bytes, return a
	* std::unique_ptr to uint64_t containing those same byte values
	* (using the host's byte order, i.e. no byte reordering is done).
	*
	* The main purpose of this function is to fix pointer alignment
	* issues when converting from a char pointer to a uint64 pointer; the
	* issue being that a uint64 pointer has stricter alignment
	* requirements than a char pointer.
	*
	* We assume that releasing the memory associated with ptr is someone
	* else's problem. So, if ptr is already correctly aligned, we just
	* cast it to uint64 and return a unique_ptr whose deleter is
	* empty. If ptr is misaligned, then we copy the nbytes at ptr to a
	* location that is correctly aligned and then return a unique_ptr
	* whose deleter will delete that allocated space when the unique_ptr
	* goes out of scope.
	*
	* NB: ASSUMES nbytes is divisible by WORD_BYTES.
	*
	* TODO: On some architectures it might be advantageous to have
	* 16-byte aligned memory, even when the words used are only 8 bytes
	* (this is to do with cache line size). This function could be easily
	* modified to allow experimentation with different alignments.
	*/
	static word_ptr
	adjust_ptr_alignment(const char *ptr, size_t nbytes) {
	static constexpr size_t wordbytes = sizeof(word_tp);
	size_t nwords = nbytes / wordbytes;
	uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
	// ptr is correctly aligned if addr is divisible by wordbytes
	if (addr % wordbytes != 0) {
	// ptr is NOT correctly aligned

	// copy_words has correct alignment
	uint64_t *copy_words = new uint64_t[nwords];
	// alias copy_words with a byte pointer; the cast safe because
	// uint8_t alignment is less restrictive than uint64_t
	uint8_t copy_bytes = reinterpret_cast<uint8_t >(copy_words);
	// copy everything across
	std::copy(ptr, ptr + nbytes, copy_bytes);
	// return a unique_ptr with array delete to delete copy_words
	auto array_delete = [] (word_tp *p) { delete[] p; };
	return word_ptr(copy_words, array_delete);
	} else {
	// ptr IS correctly aligned

	// safe cast because the address of ptr is wordbyte-aligned.
	word_tp same = reinterpret_cast<word_tp >(ptr);
	// don't delete backing array since we don't own it
	auto empty_delete = [] (word_tp *) { };
	return word_ptr(same, empty_delete);
	}
	}


	extern "C"
	{
	/**
	* Calculate population counts of an array of inputs; return how
	* long it took in milliseconds.
	*
	* 'arrays' must point to narrays*array_bytes bytes
	* 'counts' must point to narrays*sizeof(uint32_t) bytes.
	* For i = 0 to n - 1, the population count of the array_bytes*8 bits
	*
	* arrays[i * array_bytes] ... arrays[(i + 1) * array_bytes - 1]
	*
	* is put in counts[i].
	*
	* ASSUMES: array_bytes is divisible by 8.
	*/
	double
	popcount_arrays(
	uint32_t *counts,
	const char *arrays, int narrays, int array_bytes) {
	// assumes WORD_BYTES divides array_bytes
	int nwords = array_bytes / WORD_BYTES;
	// The static_cast is to avoid int overflow in the multiplication
	size_t total_bytes = static_cast<size_t>(narrays) * array_bytes;
	auto ptr = adjust_ptr_alignment(arrays, total_bytes);
	auto u = ptr.get();

	clock_t t = clock();
	switch (nwords) {
	case 32: _popcount_arrays<32>(counts, u, narrays); break;
	case 16: _popcount_arrays<16>(counts, u, narrays); break;
	case 8: _popcount_arrays< 8>(counts, u, narrays); break;
	default:
	for (int i = 0; i < narrays; ++i, u += nwords)
	counts[i] = _popcount_array(u, nwords);
	}
	return to_millis(clock() - t);
	}
	}