Last active
April 11, 2025 13:56
-
-
Save eira-fransham/80a923edc4a2c9701123677dc5f78ce2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
stdfmod: | |
movd esi,xmm0 | |
movd edx,xmm1 | |
mov eax,esi | |
mov ecx,edx | |
and eax,0x7fffffff | |
and ecx,0x7fffffff | |
cmp eax,ecx | |
jae 4011a0 <stdfmod+0x30> | |
cmp ecx,0x7f800000 | |
ja 401198 <stdfmod+0x28> | |
ret | |
nop DWORD PTR [rax+0x0] | |
mulss xmm0,xmm1 | |
ret | |
nop DWORD PTR [rax] | |
mov r9d,eax | |
mov r10d,ecx | |
mov edi,esi | |
shr r10d,0x17 | |
shr r9d,0x17 | |
and edi,0x80000000 | |
mov r8d,r9d | |
lea r11d,[r10-0x18] | |
sub r8d,r10d | |
cmp r11d,0xde | |
ja 401220 <stdfmod+0xb0> | |
cmp r8d,0x8 | |
jg 401390 <stdfmod+0x220> | |
shl ecx,0x8 | |
shl eax,0x8 | |
mov edx,ecx | |
mov ecx,r8d | |
or eax,0x80000000 | |
or edx,0x80000000 | |
shr edx,cl | |
mov ecx,edx | |
xor edx,edx | |
div ecx | |
test edx,edx | |
je 4013c1 <stdfmod+0x251> | |
bsr ecx,edx | |
xor ecx,0x1f | |
shl edx,cl | |
add ecx,0x1 | |
shr edx,0x8 | |
sub r9d,ecx | |
shl r9d,0x17 | |
or edx,edi | |
lea eax,[r9+rdx*1] | |
movd xmm0,eax | |
ret | |
nop DWORD PTR [rax+0x0] | |
test ecx,ecx | |
je 4013e9 <stdfmod+0x279> | |
cmp eax,0x7f7fffff | |
ja 4013e9 <stdfmod+0x279> | |
test r9d,r9d | |
je 401401 <stdfmod+0x291> | |
and esi,0x7fffff | |
mov r11d,0x8 | |
mov eax,esi | |
mov esi,edx | |
lea edx,[r10-0x1] | |
and esi,0x7fffff | |
or eax,0x800000 | |
or esi,0x800000 | |
test r10d,r10d | |
je 40140c <stdfmod+0x29c> | |
push rbp | |
movd xmm0,edi | |
push rbx | |
xor ebx,ebx | |
tzcnt ebx,esi | |
cmp r8d,ebx | |
mov ecx,ebx | |
cmovle ecx,r8d | |
sub r8d,ecx | |
shr esi,cl | |
lea r10d,[rdx+rcx*1] | |
mov ecx,0x8 | |
cmp r8d,ecx | |
cmovle ecx,r8d | |
xor edx,edx | |
shl eax,cl | |
sub r8d,ecx | |
div esi | |
mov r9d,edx | |
test edx,edx | |
je 401373 <stdfmod+0x203> | |
test r8d,r8d | |
je 4013c6 <stdfmod+0x256> | |
mov eax,0xffffffff | |
xor edx,edx | |
add r11d,ebx | |
div esi | |
cmp r11d,r8d | |
jge 401326 <stdfmod+0x1b6> | |
mov ebp,0x20 | |
sub ebp,r11d | |
data16 cs nop WORD PTR [rax+rax*1+0x0] | |
data16 cs nop WORD PTR [rax+rax*1+0x0] | |
data16 cs nop WORD PTR [rax+rax*1+0x0] | |
data16 cs nop WORD PTR [rax+rax*1+0x0] | |
cs nop WORD PTR [rax+rax*1+0x0] | |
mov edx,r9d | |
mov ebx,r9d | |
mov ecx,r11d | |
sub r8d,r11d | |
imul edx,eax | |
shl ebx,cl | |
mov ecx,ebp | |
shr edx,cl | |
imul edx,esi | |
sub ebx,edx | |
mov r9d,ebx | |
cmp esi,ebx | |
jb 401376 <stdfmod+0x206> | |
cmp r11d,r8d | |
jl 401300 <stdfmod+0x190> | |
mov edx,r9d | |
imul r9d,eax | |
mov ecx,r8d | |
shl edx,cl | |
mov ecx,0x20 | |
sub ecx,r8d | |
mov eax,edx | |
shr r9d,cl | |
imul r9d,esi | |
sub eax,r9d | |
cmp esi,eax | |
jb 401380 <stdfmod+0x210> | |
bsr ecx,eax | |
mov edx,r10d | |
xor ecx,0x1f | |
sub ecx,0x8 | |
shl eax,cl | |
sub edx,ecx | |
js 401421 <stdfmod+0x2b1> | |
test eax,eax | |
je 401421 <stdfmod+0x2b1> | |
add eax,edi | |
shl edx,0x17 | |
add eax,edx | |
movd xmm0,eax | |
pop rbx | |
pop rbp | |
ret | |
sub r9d,esi | |
cmp esi,r9d | |
jae 401321 <stdfmod+0x1b1> | |
jmp 401376 <stdfmod+0x206> | |
sub eax,esi | |
cmp esi,eax | |
jae 40134a <stdfmod+0x1da> | |
jmp 401380 <stdfmod+0x210> | |
nop DWORD PTR [rax+rax*1+0x0] | |
cmp eax,0x7f7fffff | |
ja 4013e9 <stdfmod+0x279> | |
and esi,0x7fffff | |
mov r11d,0x8 | |
mov eax,esi | |
mov esi,edx | |
lea edx,[r10-0x1] | |
and esi,0x7fffff | |
or eax,0x800000 | |
or esi,0x800000 | |
jmp 40126a <stdfmod+0xfa> | |
movd xmm0,edi | |
ret | |
bsr ecx,edx | |
mov eax,r10d | |
xor ecx,0x1f | |
sub ecx,0x8 | |
shl edx,cl | |
sub eax,ecx | |
js 40142d <stdfmod+0x2bd> | |
test edx,edx | |
je 40142d <stdfmod+0x2bd> | |
add edx,edi | |
shl eax,0x17 | |
add eax,edx | |
movd xmm0,eax | |
jmp 401373 <stdfmod+0x203> | |
mulss xmm0,xmm1 | |
cmp eax,0x7f800000 | |
ja 401193 <stdfmod+0x23> | |
divss xmm0,xmm0 | |
jmp 401140 <with_errnof.constprop.0> | |
xor edx,edx | |
div ecx | |
or edx,edi | |
movd xmm0,edx | |
ret | |
sub r8d,0x1 | |
bsr r11d,ecx | |
mov esi,ecx | |
xor edx,edx | |
xor r11d,0x1f | |
jmp 40126a <stdfmod+0xfa> | |
sub ecx,r10d | |
xor edx,edx | |
shr eax,cl | |
jmp 401368 <stdfmod+0x1f8> | |
sub ecx,r10d | |
xor eax,eax | |
shr edx,cl | |
jmp 4013dc <stdfmod+0x26c> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
stdfmod_hotpath: | |
movd r8d,xmm0 | |
movd edx,xmm1 | |
mov eax,r8d | |
mov esi,edx | |
and eax,0x7fffffff | |
and esi,0x7fffffff | |
cmp eax,esi | |
jae 401460 <stdfmod_hotpath+0x20> | |
ret | |
xchg ax,ax | |
mov r9d,eax | |
mov r11d,esi | |
mov edi,r8d | |
shr r11d,0x17 | |
shr r9d,0x17 | |
and edi,0x80000000 | |
mov r10d,r9d | |
lea ecx,[r11-0x18] | |
sub r10d,r11d | |
cmp ecx,0xde | |
ja 4014d0 <stdfmod_hotpath+0x90> | |
cmp r10d,0x8 | |
jg 4014d0 <stdfmod_hotpath+0x90> | |
shl esi,0x8 | |
mov ecx,r10d | |
shl eax,0x8 | |
xor edx,edx | |
or esi,0x80000000 | |
or eax,0x80000000 | |
shr esi,cl | |
div esi | |
bsr ecx,edx | |
xor ecx,0x1f | |
shl edx,cl | |
add ecx,0x1 | |
shr edx,0x8 | |
sub r9d,ecx | |
shl r9d,0x17 | |
or edx,edi | |
lea eax,[r9+rdx*1] | |
movd xmm0,eax | |
ret | |
nop DWORD PTR [rax+0x0] | |
and edx,0x7fffff | |
mov eax,r8d | |
xor r8d,r8d | |
mov esi,edx | |
and eax,0x7fffff | |
or esi,0x800000 | |
or eax,0x800000 | |
tzcnt r8d,esi | |
cmp r10d,r8d | |
mov ecx,r8d | |
cmovle ecx,r10d | |
shr esi,cl | |
sub r10d,ecx | |
lea r11d,[r11+rcx*1-0x1] | |
mov ecx,0x8 | |
cmp r10d,ecx | |
cmovle ecx,r10d | |
xor edx,edx | |
shl eax,cl | |
div esi | |
mov r9d,edx | |
sub r10d,ecx | |
je 4015e2 <stdfmod_hotpath+0x1a2> | |
mov eax,0xffffffff | |
xor edx,edx | |
add r8d,0x8 | |
push rbp | |
div esi | |
push rbx | |
cmp r8d,r10d | |
jge 4015a2 <stdfmod_hotpath+0x162> | |
mov ebp,0x20 | |
sub ebp,r8d | |
data16 cs nop WORD PTR [rax+rax*1+0x0] | |
data16 cs nop WORD PTR [rax+rax*1+0x0] | |
data16 cs nop WORD PTR [rax+rax*1+0x0] | |
data16 cs nop WORD PTR [rax+rax*1+0x0] | |
data16 cs nop WORD PTR [rax+rax*1+0x0] | |
nop DWORD PTR [rax+rax*1+0x0] | |
mov edx,r9d | |
mov ebx,r9d | |
mov ecx,r8d | |
sub r10d,r8d | |
imul edx,eax | |
shl ebx,cl | |
mov ecx,ebp | |
shr edx,cl | |
imul edx,esi | |
sub ebx,edx | |
mov r9d,ebx | |
cmp r8d,r10d | |
jl 401580 <stdfmod_hotpath+0x140> | |
imul eax,r9d | |
mov ecx,r10d | |
mov edx,r9d | |
shl edx,cl | |
mov ecx,0x20 | |
sub ecx,r10d | |
shr eax,cl | |
imul eax,esi | |
sub edx,eax | |
mov eax,r11d | |
bsr ecx,edx | |
xor ecx,0x1f | |
sub ecx,0x8 | |
shl edx,cl | |
sub eax,ecx | |
js 401604 <stdfmod_hotpath+0x1c4> | |
test edx,edx | |
je 401604 <stdfmod_hotpath+0x1c4> | |
shl eax,0x17 | |
add edx,eax | |
pop rbx | |
pop rbp | |
lea eax,[rdx+rdi*1] | |
movd xmm0,eax | |
ret | |
bsr ecx,edx | |
mov eax,r11d | |
xor ecx,0x1f | |
sub ecx,0x8 | |
shl edx,cl | |
sub eax,ecx | |
js 40160d <stdfmod_hotpath+0x1cd> | |
test edx,edx | |
je 40160d <stdfmod_hotpath+0x1cd> | |
add edx,edi | |
shl eax,0x17 | |
add eax,edx | |
movd xmm0,eax | |
ret | |
sub ecx,r11d | |
xor eax,eax | |
shr edx,cl | |
jmp 4015d6 <stdfmod_hotpath+0x196> | |
sub ecx,r11d | |
xor eax,eax | |
shr edx,cl | |
jmp 4015f8 <stdfmod_hotpath+0x1b8> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Floating-point remainder function. | |
Copyright (C) 2023-2024 Free Software Foundation, Inc. | |
This file is part of the GNU C Library. | |
The GNU C Library is free software; you can redistribute it and/or | |
modify it under the terms of the GNU Lesser General Public | |
License as published by the Free Software Foundation; either | |
version 2.1 of the License, or (at your option) any later version. | |
The GNU C Library is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranty of | |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
Lesser General Public License for more details. | |
You should have received a copy of the GNU Lesser General Public | |
License along with the GNU C Library; if not, see | |
<https://www.gnu.org/licenses/>. */ | |
#include <math.h> | |
#include <stdint.h> | |
#include <stdbool.h> | |
#include <errno.h> | |
#define BIT_WIDTH 32 | |
#define MANTISSA_WIDTH 23 | |
#define EXPONENT_WIDTH 8 | |
#define MANTISSA_MASK 0x007fffff | |
#define EXPONENT_MASK 0x7f800000 | |
#define EXP_MANT_MASK 0x7fffffff | |
#define QUIET_NAN_MASK 0x00400000 | |
#define SIGN_MASK 0x80000000 | |
#define NOINLINE __attribute__ ((noinline)) | |
NOINLINE static float | |
with_errnof (float y, int e) | |
{ | |
errno = e; | |
return y; | |
} | |
float | |
__math_edomf (float y) | |
{ | |
return with_errnof (y, EDOM); | |
} | |
static inline bool | |
is_nan (uint32_t x) | |
{ | |
return (x & EXP_MANT_MASK) > EXPONENT_MASK; | |
} | |
static inline uint32_t | |
get_mantissa (uint32_t x) | |
{ | |
return x & MANTISSA_MASK; | |
} | |
static inline float | |
asfloat (uint32_t i) | |
{ | |
union | |
{ | |
uint32_t i; | |
float f; | |
} u = { | |
i}; | |
return u.f; | |
} | |
/* Convert integer number X, unbiased exponent EP, and sign S to double: | |
result = X * 2^(EP+1 - exponent_bias) | |
NB: zero is not supported. */ | |
static inline double | |
make_float (uint32_t x, int ep, uint32_t s) | |
{ | |
int lz = __builtin_clz (x) - EXPONENT_WIDTH; | |
x <<= lz; | |
ep -= lz; | |
if (__glibc_unlikely (ep < 0 || x == 0)) | |
{ | |
x >>= -ep; | |
ep = 0; | |
} | |
return asfloat ( | |
s + x + (ep << MANTISSA_WIDTH)); | |
} | |
static inline uint32_t | |
asuint (float f) | |
{ | |
union | |
{ | |
float f; | |
uint32_t i; | |
} u = {f}; | |
return u.i; | |
} | |
/* With x = mx * 2^ex and y = my * 2^ey (mx, my, ex, ey being integers), the | |
simplest implementation is: | |
mx * 2^ex == 2 * mx * 2^(ex - 1) | |
or | |
while (ex > ey) | |
{ | |
mx *= 2; | |
--ex; | |
mx %= my; | |
} | |
With the mathematical equivalence of: | |
r == x % y == (x % (N * y)) % y | |
And with mx/my being mantissa of a single floating point number (which uses | |
less bits than the storage type), on each step the argument reduction can | |
be improved by 8 (which is the size of uint32_t minus MANTISSA_WIDTH plus | |
the implicit one bit): | |
mx * 2^ex == 2^8 * mx * 2^(ex - 8) | |
or | |
while (ex > ey) | |
{ | |
mx << 8; | |
ex -= 8; | |
mx %= my; | |
} | |
Special cases: | |
- If x or y is a NaN, a NaN is returned. | |
- If x is an infinity, or y is zero, a NaN is returned and EDOM is set. | |
- If x is +0/-0, and y is not zero, +0/-0 is returned. */ | |
float | |
stdfmod (float x, float y) | |
{ | |
uint32_t hx = asuint ( | |
x); | |
uint32_t hy = asuint ( | |
y); | |
uint32_t sx = hx & SIGN_MASK; | |
/* Get |x| and |y|. */ | |
hx ^= sx; | |
hy &= ~SIGN_MASK; | |
if (__glibc_likely (hx < hy)) | |
{ | |
/* If y is a NaN, return a NaN. */ | |
if (__glibc_unlikely (hy > EXPONENT_MASK)) | |
return x * y; | |
return x; | |
} | |
int ex = hx >> MANTISSA_WIDTH; | |
int ey = hy >> MANTISSA_WIDTH; | |
int exp_diff = ex - ey; | |
/* Common case where exponents are close: |x/y| < 2^9, x not inf/NaN | |
and |x%y| not denormal. */ | |
if (__glibc_likely (ey < (EXPONENT_MASK >> MANTISSA_WIDTH) - EXPONENT_WIDTH | |
&& ey > MANTISSA_WIDTH | |
&& exp_diff <= EXPONENT_WIDTH)) | |
{ | |
uint32_t mx = (hx << EXPONENT_WIDTH) | SIGN_MASK; | |
uint32_t my = (hy << EXPONENT_WIDTH) | SIGN_MASK; | |
mx %= (my >> exp_diff); | |
if (__glibc_unlikely (mx == 0)) | |
return asfloat ( | |
sx); | |
int shift = __builtin_clz (mx); | |
ex -= shift + 1; | |
mx <<= shift; | |
mx = sx | (mx >> EXPONENT_WIDTH); | |
return asfloat ( | |
mx + ((uint32_t)ex << MANTISSA_WIDTH)); | |
} | |
if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK)) | |
{ | |
/* If x is a NaN, return a NaN. */ | |
if (hx > EXPONENT_MASK) | |
return x * y; | |
/* If x is an infinity or y is zero, return a NaN and set EDOM. */ | |
return __math_edomf ( | |
(x * y) / (x * y)); | |
} | |
/* Special case, both x and y are denormal. */ | |
if (__glibc_unlikely (ex == 0)) | |
return asfloat ( | |
sx | hx % hy); | |
/* Extract normalized mantissas - hx is not denormal and hy != 0. */ | |
uint32_t mx = get_mantissa ( | |
hx) | (MANTISSA_MASK + 1); | |
uint32_t my = get_mantissa ( | |
hy) | (MANTISSA_MASK + 1); | |
int lead_zeros_my = EXPONENT_WIDTH; | |
ey--; | |
/* Special case for denormal y. */ | |
if (__glibc_unlikely (ey < 0)) | |
{ | |
my = hy; | |
ey = 0; | |
exp_diff--; | |
lead_zeros_my = __builtin_clz (my); | |
} | |
int tail_zeros_my = __builtin_ctz (my); | |
int sides_zeroes = lead_zeros_my + tail_zeros_my; | |
int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my; | |
my >>= right_shift; | |
exp_diff -= right_shift; | |
ey += right_shift; | |
int left_shift = exp_diff < EXPONENT_WIDTH ? exp_diff : EXPONENT_WIDTH; | |
mx <<= left_shift; | |
exp_diff -= left_shift; | |
mx %= my; | |
if (__glibc_unlikely (mx == 0)) | |
return asfloat ( | |
sx); | |
if (exp_diff == 0) | |
return make_float (mx, ey, | |
sx); | |
/* Multiplication with the inverse is faster than repeated modulo. */ | |
uint32_t inv_hy = UINT32_MAX / my; | |
while (exp_diff > sides_zeroes) { | |
exp_diff -= sides_zeroes; | |
uint32_t hd = (mx * inv_hy) >> (BIT_WIDTH - sides_zeroes); | |
mx <<= sides_zeroes; | |
mx -= hd * my; | |
while (__glibc_unlikely (mx > my)) | |
mx -= my; | |
} | |
uint32_t hd = (mx * inv_hy) >> (BIT_WIDTH - exp_diff); | |
mx <<= exp_diff; | |
mx -= hd * my; | |
while (__glibc_unlikely (mx > my)) | |
mx -= my; | |
return make_float (mx, ey, | |
sx); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment