ridiculousfish · January 28, 2023 20:03
diff --git a/benchmark_results.txt b/benchmark_results.txt
 # Benchmark results, lower is better
 # Apple M1 Max
 > clang++ -std=c++14 -O3 divlu_benchmark.cpp; ./a.out
           hackers	11.5198
        libdiv org	8.9480 # without branch
        libdiv brn	9.0656 # with branch
 
 
 # Ryzen 9 5900x
 > clang++ -O3 divlu_benchmark.cpp; ./a.out
           hackers	10.8785
        libdiv org	9.2789 # without branch
        libdiv brn	9.9835 # with branch
              divq	2.5347
              
 > g++ -O3 divlu_benchmark.cpp; ./a.out
           hackers	14.5077
        libdiv org	9.0915 # without branch
        libdiv brn	9.5235 # with branch
              divq	2.5241
              
 > g++ -O3 -march=native divlu_benchmark.cpp; ./a.out
           hackers	9.7027
        libdiv org	7.8284  # without branch
        libdiv brn	8.7095  # with branch
              divq	2.5382

diff --git a/divlu_benchmark.cpp b/divlu_benchmark.cpp
 #include <limits.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>

 #include <chrono>
 #include <random>

 /*
 * Perform a narrowing division: 128 / 64 -> 64, and 64 / 32 -> 32.
 * The dividend's low and high words are given by \p numhi and \p numlo, respectively.
 * The divisor is given by \p den.
 * \return the quotient, and the remainder by reference in \p r, if not null.
 * If the quotient would require more than 64 bits, or if denom is 0, then return the max value
 * for both quotient and remainder.
 *
 * These functions are released into the public domain, where applicable, or the CC0 license.
 */
 __attribute__((noinline)) uint64_t divllu_branch(
    uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
    // We work in base 2**32.
    // A uint32 holds a single digit. A uint64 holds two digits.
    // Our numerator is conceptually [num3, num2, num1, num0].
    // Our denominator is [den1, den0].
    const uint64_t b = (1ull << 32);

    // The high and low digits of our computed quotient.
    uint32_t q1;
    uint32_t q0;

    // The normalization shift factor.
    int shift;

    // The high and low digits of our denominator (after normalizing).
    // Also the low 2 digits of our numerator (after normalizing).
    uint32_t den1;
    uint32_t den0;
    uint32_t num1;
    uint32_t num0;

    // A partial remainder.
    uint64_t rem;

    // The estimated quotient, and its corresponding remainder.
    uint64_t qhat;
    uint64_t rhat;

    // Variables used to correct the quotient and remainder.
    uint64_t remd1;
    uint64_t remd0;
    uint32_t qcorr;

    // Check for overflow and divide by 0.
    if (numhi >= den) {
        if (r != NULL) *r = ~0ull;
        return ~0ull;
    }

    // Determine the normalization factor. We multiply den by this, so that its leading digit is at
    // least half b. In binary this means just shifting left by the number of leading zeros, so that
    // there's a 1 in the MSB.
    // We also shift numer by the same amount. This cannot overflow because numhi < den.
    // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
    // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is
    // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it.
    shift = __builtin_clzll(den);
    if (shift > 0) {
        den <<= shift;
        numhi <<= shift;
        numhi |= (numlo >> (64 - shift));
        numlo <<= shift;
    }

    // Extract the low digits of the numerator and both digits of the denominator.
    num1 = (uint32_t)(numlo >> 32);
    num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
    den1 = (uint32_t)(den >> 32);
    den0 = (uint32_t)(den & 0xFFFFFFFFu);

    // We wish to compute q1 = [n3 n2 n1] / [d1 d0].
    // Estimate q1 as [n3 n2] / [d1], and then correct it.
    // Note while qhat may be 2 digits, q1 is always 1 digit.
    qhat = numhi / den1;
    rhat = numhi % den1;

    // Estimate the true remainder.
    remd1 = rhat * b + num1;
    remd0 = qhat * den0;
    rem = remd1 - remd0;

    // Correct both qhat and remainder.
    if (remd0 > remd1) {
        qcorr = (remd0 - remd1 > den);
        qhat -= (qcorr + 1);
        rem += den << qcorr;
    }
    q1 = (uint32_t)qhat;

    // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
    // Estimate q0 as [rem1 rem0] / [d1] and correct it.
    qhat = rem / den1;
    rhat = rem % den1;

    // Estimate the true remainder.
    remd1 = rhat * b + num0;
    remd0 = qhat * den0;
    rem = remd1 - remd0;

    if (remd0 > remd1) {
        qcorr = (remd0 - remd1 > den);
        qhat -= (qcorr + 1);
        rem += den << qcorr;
    }
    q0 = (uint32_t)qhat;

    // Return remainder if requested.
    if (r != NULL) *r = rem >> shift;
    return ((uint64_t)q1 << 32) | q0;
 }

 __attribute__((noinline)) uint64_t divllu_mine(
    uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
    // We work in base 2**32.
    // A uint32 holds a single digit. A uint64 holds two digits.
    // Our numerator is conceptually [num3, num2, num1, num0].
    // Our denominator is [den1, den0].
    const uint64_t b = (1ull << 32);

    // The high and low digits of our computed quotient.
    uint32_t q1;
    uint32_t q0;

    // The normalization shift factor.
    int shift;

    // The high and low digits of our denominator (after normalizing).
    // Also the low 2 digits of our numerator (after normalizing).
    uint32_t den1;
    uint32_t den0;
    uint32_t num1;
    uint32_t num0;

    // A partial remainder.
    uint64_t rem;

    // The estimated quotient, and its corresponding remainder.
    uint64_t qhat;
    uint64_t rhat;

    // Variables used to correct the quotient and remainder.
    uint64_t remd1;
    uint64_t remd0;
    uint32_t qcorr;

    // Check for overflow and divide by 0.
    if (numhi >= den) {
        if (r != NULL) *r = ~0ull;
        return ~0ull;
    }

    // Determine the normalization factor. We multiply den by this, so that its leading digit is at
    // least half b. In binary this means just shifting left by the number of leading zeros, so that
    // there's a 1 in the MSB.
    // We also shift numer by the same amount. This cannot overflow because numhi < den.
    // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
    // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is
    // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it.
    shift = __builtin_clzll(den);
    den <<= shift;
    numhi <<= shift;
    numhi |= (numlo >> (-shift & 63)) & (-(int64_t)shift >> 63);
    numlo <<= shift;

    // Extract the low digits of the numerator and both digits of the denominator.
    num1 = (uint32_t)(numlo >> 32);
    num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
    den1 = (uint32_t)(den >> 32);
    den0 = (uint32_t)(den & 0xFFFFFFFFu);

    // We wish to compute q1 = [n3 n2 n1] / [d1 d0].
    // Estimate q1 as [n3 n2] / [d1], and then correct it.
    // Note while qhat may be 2 digits, q1 is always 1 digit.
    qhat = numhi / den1;
    rhat = numhi % den1;

    // Estimate the true remainder.
    remd1 = rhat * b + num1;
    remd0 = qhat * den0;
    rem = remd1 - remd0;

    // Correct both qhat and remainder.
    if (remd0 > remd1) {
        qcorr = (remd0 - remd1 > den);
        qhat -= (qcorr + 1);
        rem += den << qcorr;
    }
    q1 = (uint32_t)qhat;

    // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
    // Estimate q0 as [rem1 rem0] / [d1] and correct it.
    qhat = rem / den1;
    rhat = rem % den1;

    // Estimate the true remainder.
    remd1 = rhat * b + num0;
    remd0 = qhat * den0;
    rem = remd1 - remd0;

    if (remd0 > remd1) {
        qcorr = (remd0 - remd1 > den);
        qhat -= (qcorr + 1);
        rem += den << qcorr;
    }
    q0 = (uint32_t)qhat;

    // Return remainder if requested.
    if (r != NULL) *r = rem >> shift;
    return ((uint64_t)q1 << 32) | q0;
 }

 __attribute__((noinline)) uint64_t divllu_orig(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
    const uint64_t b = (1ULL << 32);  // Number base (16 bits).
    uint64_t un1, un0,                // Norm. dividend LSD's.
        vn1, vn0,                     // Norm. divisor digits.
        q1, q0,                       // Quotient digits.
        un64, un21, un10,             // Dividend digit pairs.
        rhat;                         // A remainder.
    int s;                            // Shift amount for norm.

    if (u1 >= v) {                // If overflow, set rem.
        if (r != NULL)            // to an impossible value,
            *r = (uint64_t)(-1);  // and return the largest
        return (uint64_t)(-1);
    }  // possible quotient.

    /* count leading zeros */
    s = __builtin_clzll(v);  // 0 <= s <= 63.

    v = v << s;            // Normalize divisor.
    vn1 = v >> 32;         // Break divisor up into
    vn0 = v & 0xFFFFFFFF;  // two 32-bit digits.

    un64 = (u1 << s) | ((u0 >> (64 - s)) & (-s >> 31));
    un10 = u0 << s;  // Shift dividend left.

    un1 = un10 >> 32;         // Break right half of
    un0 = un10 & 0xFFFFFFFF;  // dividend into two digits.

    q1 = un64 / vn1;         // Compute the first
    rhat = un64 - q1 * vn1;  // quotient digit, q1.
 again1:
    if (q1 >= b || q1 * vn0 > b * rhat + un1) {
        q1 = q1 - 1;
        rhat = rhat + vn1;
        if (rhat < b) goto again1;
    }

    un21 = un64 * b + un1 - q1 * v;  // Multiply and subtract.

    q0 = un21 / vn1;         // Compute the second
    rhat = un21 - q0 * vn1;  // quotient digit, q0.
 again2:
    if (q0 >= b || q0 * vn0 > b * rhat + un0) {
        q0 = q0 - 1;
        rhat = rhat + vn1;
        if (rhat < b) goto again2;
    }

    if (r != NULL)                            // If remainder is wanted,
        *r = (un21 * b + un0 - q0 * v) >> s;  // return it.
    return q1 * b + q0;
 }

 #if defined(__x86_64__)
 __attribute__((noinline)) uint64_t divllu_asm(
    uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
    uint64_t result;
    __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi));
    return result;
 }
 #endif

 constexpr size_t CASE_COUNT = 1 << 16;
 constexpr size_t ITER_COUNT = 1 << 11;
 struct case_t {
    uint64_t numerhi;
    uint64_t numerlo;
    uint64_t denom;

    bool valid() const { return denom > 0 && numerhi < denom; }
 };

 static const case_t *make_cases() {
    std::mt19937_64 mt(std::mt19937_64::default_seed);
    case_t *cases = new case_t[CASE_COUNT];
    for (size_t i = 0; i < CASE_COUNT; i++) {
        do {
            cases[i].numerlo = mt();
            cases[i].numerhi = mt();
            cases[i].denom = mt();
        } while (!cases[i].valid());
    }
    return cases;
 }

 using divider_t = uint64_t (*)(uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r);

 __attribute__((noinline)) static uint64_t time_function(const case_t *cases, divider_t div) {
    uint64_t sum = 0;
    for (size_t i = 0; i < CASE_COUNT; i++) {
        uint64_t quot;
        uint64_t rem;
        quot = div(cases[i].numerhi, cases[i].numerlo, cases[i].denom, &rem);
        sum += quot;
        sum += rem;
    }
    return sum;
 }

 int main(int argc, char *argv[]) {
    int onlycase = 0;
    if (argv[1]) onlycase = atoi(argv[1]);
    const struct {
        const char *name;
        divider_t func;
    } dividers[] = {
        {"hackers", divllu_orig},
        {"libdiv org", divllu_mine},
        {"libdiv brn", divllu_branch},
 #if defined(__x86_64__)
        {"divq", divllu_asm},
 #endif
    };
    const case_t *cases = make_cases();
    using namespace std::chrono;
    const uint64_t exp = time_function(cases, divllu_orig);
    int whichcase = 0;
    for (const auto &d : dividers) {
        whichcase++;
        if (onlycase && whichcase != onlycase) continue;
        uint64_t best = std::numeric_limits<uint64_t>::max();
        for (size_t i = 0; i < ITER_COUNT; i++) {
            using namespace std::chrono;
            auto t1 = high_resolution_clock::now();
            uint64_t res = time_function(cases, d.func);
            auto t2 = high_resolution_clock::now();
            uint64_t nanos = duration_cast<std::chrono::nanoseconds>(t2 - t1).count();
            best = std::min(best, nanos);
            if (res != exp) abort();
        }
        double nsec = (double)best / (double)CASE_COUNT;
        printf("%18s\t%4.4f\n", d.name, nsec);
    }
    return 0;
 }
	# Benchmark results, lower is better
	# Apple M1 Max
	> clang++ -std=c++14 -O3 divlu_benchmark.cpp; ./a.out
	hackers 11.5198
	libdiv org 8.9480 # without branch
	libdiv brn 9.0656 # with branch


	# Ryzen 9 5900x
	> clang++ -O3 divlu_benchmark.cpp; ./a.out
	hackers 10.8785
	libdiv org 9.2789 # without branch
	libdiv brn 9.9835 # with branch
	divq 2.5347

	> g++ -O3 divlu_benchmark.cpp; ./a.out
	hackers 14.5077
	libdiv org 9.0915 # without branch
	libdiv brn 9.5235 # with branch
	divq 2.5241

	> g++ -O3 -march=native divlu_benchmark.cpp; ./a.out
	hackers 9.7027
	libdiv org 7.8284 # without branch
	libdiv brn 8.7095 # with branch
	divq 2.5382
	#include <limits.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	#include <chrono>
	#include <random>

	/*
	* Perform a narrowing division: 128 / 64 -> 64, and 64 / 32 -> 32.
	* The dividend's low and high words are given by \p numhi and \p numlo, respectively.
	* The divisor is given by \p den.
	* \return the quotient, and the remainder by reference in \p r, if not null.
	* If the quotient would require more than 64 bits, or if denom is 0, then return the max value
	* for both quotient and remainder.
	*
	* These functions are released into the public domain, where applicable, or the CC0 license.
	*/
	__attribute__((noinline)) uint64_t divllu_branch(
	uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
	// We work in base 2**32.
	// A uint32 holds a single digit. A uint64 holds two digits.
	// Our numerator is conceptually [num3, num2, num1, num0].
	// Our denominator is [den1, den0].
	const uint64_t b = (1ull << 32);

	// The high and low digits of our computed quotient.
	uint32_t q1;
	uint32_t q0;

	// The normalization shift factor.
	int shift;

	// The high and low digits of our denominator (after normalizing).
	// Also the low 2 digits of our numerator (after normalizing).
	uint32_t den1;
	uint32_t den0;
	uint32_t num1;
	uint32_t num0;

	// A partial remainder.
	uint64_t rem;

	// The estimated quotient, and its corresponding remainder.
	uint64_t qhat;
	uint64_t rhat;

	// Variables used to correct the quotient and remainder.
	uint64_t remd1;
	uint64_t remd0;
	uint32_t qcorr;

	// Check for overflow and divide by 0.
	if (numhi >= den) {
	if (r != NULL) *r = ~0ull;
	return ~0ull;
	}

	// Determine the normalization factor. We multiply den by this, so that its leading digit is at
	// least half b. In binary this means just shifting left by the number of leading zeros, so that
	// there's a 1 in the MSB.
	// We also shift numer by the same amount. This cannot overflow because numhi < den.
	// The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
	// by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is
	// 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it.
	shift = __builtin_clzll(den);
	if (shift > 0) {
	den <<= shift;
	numhi <<= shift;
	numhi \|= (numlo >> (64 - shift));
	numlo <<= shift;
	}

	// Extract the low digits of the numerator and both digits of the denominator.
	num1 = (uint32_t)(numlo >> 32);
	num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
	den1 = (uint32_t)(den >> 32);
	den0 = (uint32_t)(den & 0xFFFFFFFFu);

	// We wish to compute q1 = [n3 n2 n1] / [d1 d0].
	// Estimate q1 as [n3 n2] / [d1], and then correct it.
	// Note while qhat may be 2 digits, q1 is always 1 digit.
	qhat = numhi / den1;
	rhat = numhi % den1;

	// Estimate the true remainder.
	remd1 = rhat * b + num1;
	remd0 = qhat * den0;
	rem = remd1 - remd0;

	// Correct both qhat and remainder.
	if (remd0 > remd1) {
	qcorr = (remd0 - remd1 > den);
	qhat -= (qcorr + 1);
	rem += den << qcorr;
	}
	q1 = (uint32_t)qhat;

	// We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
	// Estimate q0 as [rem1 rem0] / [d1] and correct it.
	qhat = rem / den1;
	rhat = rem % den1;

	// Estimate the true remainder.
	remd1 = rhat * b + num0;
	remd0 = qhat * den0;
	rem = remd1 - remd0;

	if (remd0 > remd1) {
	qcorr = (remd0 - remd1 > den);
	qhat -= (qcorr + 1);
	rem += den << qcorr;
	}
	q0 = (uint32_t)qhat;

	// Return remainder if requested.
	if (r != NULL) *r = rem >> shift;
	return ((uint64_t)q1 << 32) \| q0;
	}

	__attribute__((noinline)) uint64_t divllu_mine(
	uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
	// We work in base 2**32.
	// A uint32 holds a single digit. A uint64 holds two digits.
	// Our numerator is conceptually [num3, num2, num1, num0].
	// Our denominator is [den1, den0].
	const uint64_t b = (1ull << 32);

	// The high and low digits of our computed quotient.
	uint32_t q1;
	uint32_t q0;

	// The normalization shift factor.
	int shift;

	// The high and low digits of our denominator (after normalizing).
	// Also the low 2 digits of our numerator (after normalizing).
	uint32_t den1;
	uint32_t den0;
	uint32_t num1;
	uint32_t num0;

	// A partial remainder.
	uint64_t rem;

	// The estimated quotient, and its corresponding remainder.
	uint64_t qhat;
	uint64_t rhat;

	// Variables used to correct the quotient and remainder.
	uint64_t remd1;
	uint64_t remd0;
	uint32_t qcorr;

	// Check for overflow and divide by 0.
	if (numhi >= den) {
	if (r != NULL) *r = ~0ull;
	return ~0ull;
	}

	// Determine the normalization factor. We multiply den by this, so that its leading digit is at
	// least half b. In binary this means just shifting left by the number of leading zeros, so that
	// there's a 1 in the MSB.
	// We also shift numer by the same amount. This cannot overflow because numhi < den.
	// The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
	// by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is
	// 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it.
	shift = __builtin_clzll(den);
	den <<= shift;
	numhi <<= shift;
	numhi \|= (numlo >> (-shift & 63)) & (-(int64_t)shift >> 63);
	numlo <<= shift;

	// Extract the low digits of the numerator and both digits of the denominator.
	num1 = (uint32_t)(numlo >> 32);
	num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
	den1 = (uint32_t)(den >> 32);
	den0 = (uint32_t)(den & 0xFFFFFFFFu);

	// We wish to compute q1 = [n3 n2 n1] / [d1 d0].
	// Estimate q1 as [n3 n2] / [d1], and then correct it.
	// Note while qhat may be 2 digits, q1 is always 1 digit.
	qhat = numhi / den1;
	rhat = numhi % den1;

	// Estimate the true remainder.
	remd1 = rhat * b + num1;
	remd0 = qhat * den0;
	rem = remd1 - remd0;

	// Correct both qhat and remainder.
	if (remd0 > remd1) {
	qcorr = (remd0 - remd1 > den);
	qhat -= (qcorr + 1);
	rem += den << qcorr;
	}
	q1 = (uint32_t)qhat;

	// We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
	// Estimate q0 as [rem1 rem0] / [d1] and correct it.
	qhat = rem / den1;
	rhat = rem % den1;

	// Estimate the true remainder.
	remd1 = rhat * b + num0;
	remd0 = qhat * den0;
	rem = remd1 - remd0;

	if (remd0 > remd1) {
	qcorr = (remd0 - remd1 > den);
	qhat -= (qcorr + 1);
	rem += den << qcorr;
	}
	q0 = (uint32_t)qhat;

	// Return remainder if requested.
	if (r != NULL) *r = rem >> shift;
	return ((uint64_t)q1 << 32) \| q0;
	}

	__attribute__((noinline)) uint64_t divllu_orig(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
	const uint64_t b = (1ULL << 32); // Number base (16 bits).
	uint64_t un1, un0, // Norm. dividend LSD's.
	vn1, vn0, // Norm. divisor digits.
	q1, q0, // Quotient digits.
	un64, un21, un10, // Dividend digit pairs.
	rhat; // A remainder.
	int s; // Shift amount for norm.

	if (u1 >= v) { // If overflow, set rem.
	if (r != NULL) // to an impossible value,
	*r = (uint64_t)(-1); // and return the largest
	return (uint64_t)(-1);
	} // possible quotient.

	/* count leading zeros */
	s = __builtin_clzll(v); // 0 <= s <= 63.

	v = v << s; // Normalize divisor.
	vn1 = v >> 32; // Break divisor up into
	vn0 = v & 0xFFFFFFFF; // two 32-bit digits.

	un64 = (u1 << s) \| ((u0 >> (64 - s)) & (-s >> 31));
	un10 = u0 << s; // Shift dividend left.

	un1 = un10 >> 32; // Break right half of
	un0 = un10 & 0xFFFFFFFF; // dividend into two digits.

	q1 = un64 / vn1; // Compute the first
	rhat = un64 - q1 * vn1; // quotient digit, q1.
	again1:
	if (q1 >= b \|\| q1 * vn0 > b * rhat + un1) {
	q1 = q1 - 1;
	rhat = rhat + vn1;
	if (rhat < b) goto again1;
	}

	un21 = un64 * b + un1 - q1 * v; // Multiply and subtract.

	q0 = un21 / vn1; // Compute the second
	rhat = un21 - q0 * vn1; // quotient digit, q0.
	again2:
	if (q0 >= b \|\| q0 * vn0 > b * rhat + un0) {
	q0 = q0 - 1;
	rhat = rhat + vn1;
	if (rhat < b) goto again2;
	}

	if (r != NULL) // If remainder is wanted,
	r = (un21 b + un0 - q0 * v) >> s; // return it.
	return q1 * b + q0;
	}

	#if defined(__x86_64__)
	__attribute__((noinline)) uint64_t divllu_asm(
	uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
	uint64_t result;
	__asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi));
	return result;
	}
	#endif

	constexpr size_t CASE_COUNT = 1 << 16;
	constexpr size_t ITER_COUNT = 1 << 11;
	struct case_t {
	uint64_t numerhi;
	uint64_t numerlo;
	uint64_t denom;

	bool valid() const { return denom > 0 && numerhi < denom; }
	};

	static const case_t *make_cases() {
	std::mt19937_64 mt(std::mt19937_64::default_seed);
	case_t *cases = new case_t[CASE_COUNT];
	for (size_t i = 0; i < CASE_COUNT; i++) {
	do {
	cases[i].numerlo = mt();
	cases[i].numerhi = mt();
	cases[i].denom = mt();
	} while (!cases[i].valid());
	}
	return cases;
	}

	using divider_t = uint64_t ()(uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t r);

	__attribute__((noinline)) static uint64_t time_function(const case_t *cases, divider_t div) {
	uint64_t sum = 0;
	for (size_t i = 0; i < CASE_COUNT; i++) {
	uint64_t quot;
	uint64_t rem;
	quot = div(cases[i].numerhi, cases[i].numerlo, cases[i].denom, &rem);
	sum += quot;
	sum += rem;
	}
	return sum;
	}

	int main(int argc, char *argv[]) {
	int onlycase = 0;
	if (argv[1]) onlycase = atoi(argv[1]);
	const struct {
	const char *name;
	divider_t func;
	} dividers[] = {
	{"hackers", divllu_orig},
	{"libdiv org", divllu_mine},
	{"libdiv brn", divllu_branch},
	#if defined(__x86_64__)
	{"divq", divllu_asm},
	#endif
	};
	const case_t *cases = make_cases();
	using namespace std::chrono;
	const uint64_t exp = time_function(cases, divllu_orig);
	int whichcase = 0;
	for (const auto &d : dividers) {
	whichcase++;
	if (onlycase && whichcase != onlycase) continue;
	uint64_t best = std::numeric_limits<uint64_t>::max();
	for (size_t i = 0; i < ITER_COUNT; i++) {
	using namespace std::chrono;
	auto t1 = high_resolution_clock::now();
	uint64_t res = time_function(cases, d.func);
	auto t2 = high_resolution_clock::now();
	uint64_t nanos = duration_cast<std::chrono::nanoseconds>(t2 - t1).count();
	best = std::min(best, nanos);
	if (res != exp) abort();
	}
	double nsec = (double)best / (double)CASE_COUNT;
	printf("%18s\t%4.4f\n", d.name, nsec);
	}
	return 0;
	}