I have been working on a simple precision-doubling template.
The largest integer supported by GCC and Clang are the __int128
and unsigned __int128
types.
In general, there is no way to access the carry flag necessary to implement multiple precision additions. Fortunately Clang detects that's what one is doing and generate good assembler, at least in AMD64 (x86-64):
#include <stdint.h>
using uint128_t = unsigned __int128;
struct u256 {
uint128_t low, high;
constexpr u256 operator+(uint128_t arg) const {
u256 rv = { low + arg, high };
if(rv.low < low) { ++rv.high; }
return rv;
}
};
u256 plus(u256 big, uint128_t small) {
return big + small;
}
plus
gets translated to:
plus(u256, unsigned __int128): # @plus(u256, unsigned __int128)
add rsi, qword ptr [rsp + 8]
adc rdx, qword ptr [rsp + 16]
mov qword ptr [rdi], rsi
mov qword ptr [rdi + 8], rdx
mov rax, qword ptr [rsp + 24]
mov rcx, qword ptr [rsp + 32]
adc rax, 0
adc rcx, 0
mov qword ptr [rdi + 16], rax
mov qword ptr [rdi + 24], rcx
mov rax, rdi
ret
Without any conditionals. Unfortunately, I don't know how to get GCC to understand I am after add-with-carry:
plus(u256, unsigned __int128):
push rbx
mov r9, rsi
mov rsi, QWORD PTR [rsp+16]
mov rax, rdi
mov rdi, QWORD PTR [rsp+24]
mov r10, rdx
mov rcx, QWORD PTR [rsp+32]
mov rbx, QWORD PTR [rsp+40]
add r9, rsi
adc r10, rdi
mov QWORD PTR [rax], r9
cmp rdi, r10
mov QWORD PTR [rax+8], r10
ja .L2
jnb .L9
.L7:
mov QWORD PTR [rax+24], rbx
mov QWORD PTR [rax+16], rcx
pop rbx
ret
.L9:
cmp rsi, r9
jbe .L7
.L2:
add rcx, 1
adc rbx, 0
mov QWORD PTR [rax+16], rcx
mov QWORD PTR [rax+24], rbx
pop rbx
ret
With regards to shifting, to simplify the explanation, let us work with pairs of uint64_t
:
struct u128 {
uint64_t low, high;
constexpr u128 shiftRight(unsigned arg) const {
if(64 <= arg) {
auto adjusted = arg - 64;
return { high >> adjusted, 0 };
}
u128 rv = { low >> arg, high >> arg };
auto carriedMask = (uint64_t(1) << arg) - 1;
auto carryDown = (high & carriedMask) << (64 - arg);
rv.low |= carryDown;
return rv;
}
};
u256 shiftRight(u256 value, unsigned shift) {
return value.shiftRight(shift);
}
Clang again "understands" what the code is trying to do, this time that the lower bits of high
are carried down to the upper bits of low
:
shiftRight(u128, unsigned int): # @shiftRight(u128, unsigned int)
mov ecx, edx
cmp ecx, 64
jb .LBB1_2
add ecx, -64
shr rsi, cl
xor edx, edx
mov rax, rsi
ret
.LBB1_2:
mov rdx, rsi
shr rdx, cl
mov eax, 1
shl rax, cl
dec rax
and rax, rsi
shrd rdi, rax, cl
mov rsi, rdi
mov rax, rsi
ret
That is, Clang uses the three-argument version of SHR
to "spill down" the bits from rdi
into rax
.
But not GCC:
shiftRight(u128, unsigned int):
cmp edx, 63
jbe .L11
lea ecx, [rdx-64]
mov QWORD PTR [rsp-16], 0
shr rsi, cl
mov QWORD PTR [rsp-40], rsi
.L12:
movq xmm0, QWORD PTR [rsp-40]
movhps xmm0, QWORD PTR [rsp-16]
movaps XMMWORD PTR [rsp-40], xmm0
mov rax, QWORD PTR [rsp-40]
mov rdx, QWORD PTR [rsp-32]
ret
.L11:
mov ecx, edx
mov rax, rsi
shr rax, cl
mov QWORD PTR [rsp-16], rax
mov rax, -1
sal rax, cl
mov ecx, 64
not rax
sub ecx, edx
and rsi, rax
sal rsi, cl
mov ecx, edx
shr rdi, cl
or rsi, rdi
mov QWORD PTR [rsp-40], rsi
jmp .L12
Perhaps there is an extension to get GCC to "understand" it should use carry or the three-operand version of SHR.