I have been working on a simple precision-doubling template.

The largest integer supported by GCC and Clang are the __int128 and unsigned __int128 types.

Addition

In general, there is no way to access the carry flag necessary to implement multiple precision additions. Fortunately Clang detects that's what one is doing and generate good assembler, at least in AMD64 (x86-64):

#include <stdint.h>

using uint128_t = unsigned __int128;
struct u256 {
    uint128_t low, high;

    constexpr u256 operator+(uint128_t arg) const {
        u256 rv = { low + arg, high };
        if(rv.low < low) { ++rv.high; }
        return rv;
    }
};

u256 plus(u256 big, uint128_t small) {
    return big + small;
}

plus gets translated to:

plus(u256, unsigned __int128):                          # @plus(u256, unsigned __int128)
        add     rsi, qword ptr [rsp + 8]
        adc     rdx, qword ptr [rsp + 16]
        mov     qword ptr [rdi], rsi
        mov     qword ptr [rdi + 8], rdx
        mov     rax, qword ptr [rsp + 24]
        mov     rcx, qword ptr [rsp + 32]
        adc     rax, 0
        adc     rcx, 0
        mov     qword ptr [rdi + 16], rax
        mov     qword ptr [rdi + 24], rcx
        mov     rax, rdi
        ret

Without any conditionals. Unfortunately, I don't know how to get GCC to understand I am after add-with-carry:

plus(u256, unsigned __int128):
        push    rbx
        mov     r9, rsi
        mov     rsi, QWORD PTR [rsp+16]
        mov     rax, rdi
        mov     rdi, QWORD PTR [rsp+24]
        mov     r10, rdx
        mov     rcx, QWORD PTR [rsp+32]
        mov     rbx, QWORD PTR [rsp+40]
        add     r9, rsi
        adc     r10, rdi
        mov     QWORD PTR [rax], r9
        cmp     rdi, r10
        mov     QWORD PTR [rax+8], r10
        ja      .L2
        jnb     .L9
.L7:
        mov     QWORD PTR [rax+24], rbx
        mov     QWORD PTR [rax+16], rcx
        pop     rbx
        ret
.L9:
        cmp     rsi, r9
        jbe     .L7
.L2:
        add     rcx, 1
        adc     rbx, 0
        mov     QWORD PTR [rax+16], rcx
        mov     QWORD PTR [rax+24], rbx
        pop     rbx
        ret

Shift operations

With regards to shifting, to simplify the explanation, let us work with pairs of uint64_t:

struct u128 {
    uint64_t low, high;

    constexpr u128 shiftRight(unsigned arg) const {
        if(64 <= arg) {
            auto adjusted = arg - 64;
            return { high >> adjusted, 0 };
        }
        u128 rv = { low >> arg, high >> arg };
        auto carriedMask = (uint64_t(1) << arg) - 1;
        auto carryDown = (high & carriedMask) << (64 - arg);
        rv.low |= carryDown;
        return rv;
    }
};

u256 shiftRight(u256 value, unsigned shift) {
    return value.shiftRight(shift);
}

Clang again "understands" what the code is trying to do, this time that the lower bits of high are carried down to the upper bits of low:

shiftRight(u128, unsigned int):                   # @shiftRight(u128, unsigned int)
        mov     ecx, edx
        cmp     ecx, 64
        jb      .LBB1_2
        add     ecx, -64
        shr     rsi, cl
        xor     edx, edx
        mov     rax, rsi
        ret
.LBB1_2:
        mov     rdx, rsi
        shr     rdx, cl
        mov     eax, 1
        shl     rax, cl
        dec     rax
        and     rax, rsi
        shrd    rdi, rax, cl
        mov     rsi, rdi
        mov     rax, rsi
        ret

That is, Clang uses the three-argument version of SHR to "spill down" the bits from rdi into rax.

But not GCC:

shiftRight(u128, unsigned int):
        cmp     edx, 63
        jbe     .L11
        lea     ecx, [rdx-64]
        mov     QWORD PTR [rsp-16], 0
        shr     rsi, cl
        mov     QWORD PTR [rsp-40], rsi
.L12:
        movq    xmm0, QWORD PTR [rsp-40]
        movhps  xmm0, QWORD PTR [rsp-16]
        movaps  XMMWORD PTR [rsp-40], xmm0
        mov     rax, QWORD PTR [rsp-40]
        mov     rdx, QWORD PTR [rsp-32]
        ret
.L11:
        mov     ecx, edx
        mov     rax, rsi
        shr     rax, cl
        mov     QWORD PTR [rsp-16], rax
        mov     rax, -1
        sal     rax, cl
        mov     ecx, 64
        not     rax
        sub     ecx, edx
        and     rsi, rax
        sal     rsi, cl
        mov     ecx, edx
        shr     rdi, cl
        or      rsi, rdi
        mov     QWORD PTR [rsp-40], rsi
        jmp     .L12

Perhaps there is an extension to get GCC to "understand" it should use carry or the three-operand version of SHR.

thecppzoo/Multiple-precision-integer-operations-in-Clang-and-GCC.md

Select an option

No results found

Select an option

No results found

Addition

Shift operations