Skip to content

Instantly share code, notes, and snippets.

@eira-fransham
Last active April 11, 2025 13:56
Show Gist options
  • Save eira-fransham/80a923edc4a2c9701123677dc5f78ce2 to your computer and use it in GitHub Desktop.
Save eira-fransham/80a923edc4a2c9701123677dc5f78ce2 to your computer and use it in GitHub Desktop.
stdfmod:
movd esi,xmm0
movd edx,xmm1
mov eax,esi
mov ecx,edx
and eax,0x7fffffff
and ecx,0x7fffffff
cmp eax,ecx
jae 4011a0 <stdfmod+0x30>
cmp ecx,0x7f800000
ja 401198 <stdfmod+0x28>
ret
nop DWORD PTR [rax+0x0]
mulss xmm0,xmm1
ret
nop DWORD PTR [rax]
mov r9d,eax
mov r10d,ecx
mov edi,esi
shr r10d,0x17
shr r9d,0x17
and edi,0x80000000
mov r8d,r9d
lea r11d,[r10-0x18]
sub r8d,r10d
cmp r11d,0xde
ja 401220 <stdfmod+0xb0>
cmp r8d,0x8
jg 401390 <stdfmod+0x220>
shl ecx,0x8
shl eax,0x8
mov edx,ecx
mov ecx,r8d
or eax,0x80000000
or edx,0x80000000
shr edx,cl
mov ecx,edx
xor edx,edx
div ecx
test edx,edx
je 4013c1 <stdfmod+0x251>
bsr ecx,edx
xor ecx,0x1f
shl edx,cl
add ecx,0x1
shr edx,0x8
sub r9d,ecx
shl r9d,0x17
or edx,edi
lea eax,[r9+rdx*1]
movd xmm0,eax
ret
nop DWORD PTR [rax+0x0]
test ecx,ecx
je 4013e9 <stdfmod+0x279>
cmp eax,0x7f7fffff
ja 4013e9 <stdfmod+0x279>
test r9d,r9d
je 401401 <stdfmod+0x291>
and esi,0x7fffff
mov r11d,0x8
mov eax,esi
mov esi,edx
lea edx,[r10-0x1]
and esi,0x7fffff
or eax,0x800000
or esi,0x800000
test r10d,r10d
je 40140c <stdfmod+0x29c>
push rbp
movd xmm0,edi
push rbx
xor ebx,ebx
tzcnt ebx,esi
cmp r8d,ebx
mov ecx,ebx
cmovle ecx,r8d
sub r8d,ecx
shr esi,cl
lea r10d,[rdx+rcx*1]
mov ecx,0x8
cmp r8d,ecx
cmovle ecx,r8d
xor edx,edx
shl eax,cl
sub r8d,ecx
div esi
mov r9d,edx
test edx,edx
je 401373 <stdfmod+0x203>
test r8d,r8d
je 4013c6 <stdfmod+0x256>
mov eax,0xffffffff
xor edx,edx
add r11d,ebx
div esi
cmp r11d,r8d
jge 401326 <stdfmod+0x1b6>
mov ebp,0x20
sub ebp,r11d
data16 cs nop WORD PTR [rax+rax*1+0x0]
data16 cs nop WORD PTR [rax+rax*1+0x0]
data16 cs nop WORD PTR [rax+rax*1+0x0]
data16 cs nop WORD PTR [rax+rax*1+0x0]
cs nop WORD PTR [rax+rax*1+0x0]
mov edx,r9d
mov ebx,r9d
mov ecx,r11d
sub r8d,r11d
imul edx,eax
shl ebx,cl
mov ecx,ebp
shr edx,cl
imul edx,esi
sub ebx,edx
mov r9d,ebx
cmp esi,ebx
jb 401376 <stdfmod+0x206>
cmp r11d,r8d
jl 401300 <stdfmod+0x190>
mov edx,r9d
imul r9d,eax
mov ecx,r8d
shl edx,cl
mov ecx,0x20
sub ecx,r8d
mov eax,edx
shr r9d,cl
imul r9d,esi
sub eax,r9d
cmp esi,eax
jb 401380 <stdfmod+0x210>
bsr ecx,eax
mov edx,r10d
xor ecx,0x1f
sub ecx,0x8
shl eax,cl
sub edx,ecx
js 401421 <stdfmod+0x2b1>
test eax,eax
je 401421 <stdfmod+0x2b1>
add eax,edi
shl edx,0x17
add eax,edx
movd xmm0,eax
pop rbx
pop rbp
ret
sub r9d,esi
cmp esi,r9d
jae 401321 <stdfmod+0x1b1>
jmp 401376 <stdfmod+0x206>
sub eax,esi
cmp esi,eax
jae 40134a <stdfmod+0x1da>
jmp 401380 <stdfmod+0x210>
nop DWORD PTR [rax+rax*1+0x0]
cmp eax,0x7f7fffff
ja 4013e9 <stdfmod+0x279>
and esi,0x7fffff
mov r11d,0x8
mov eax,esi
mov esi,edx
lea edx,[r10-0x1]
and esi,0x7fffff
or eax,0x800000
or esi,0x800000
jmp 40126a <stdfmod+0xfa>
movd xmm0,edi
ret
bsr ecx,edx
mov eax,r10d
xor ecx,0x1f
sub ecx,0x8
shl edx,cl
sub eax,ecx
js 40142d <stdfmod+0x2bd>
test edx,edx
je 40142d <stdfmod+0x2bd>
add edx,edi
shl eax,0x17
add eax,edx
movd xmm0,eax
jmp 401373 <stdfmod+0x203>
mulss xmm0,xmm1
cmp eax,0x7f800000
ja 401193 <stdfmod+0x23>
divss xmm0,xmm0
jmp 401140 <with_errnof.constprop.0>
xor edx,edx
div ecx
or edx,edi
movd xmm0,edx
ret
sub r8d,0x1
bsr r11d,ecx
mov esi,ecx
xor edx,edx
xor r11d,0x1f
jmp 40126a <stdfmod+0xfa>
sub ecx,r10d
xor edx,edx
shr eax,cl
jmp 401368 <stdfmod+0x1f8>
sub ecx,r10d
xor eax,eax
shr edx,cl
jmp 4013dc <stdfmod+0x26c>
stdfmod_hotpath:
movd r8d,xmm0
movd edx,xmm1
mov eax,r8d
mov esi,edx
and eax,0x7fffffff
and esi,0x7fffffff
cmp eax,esi
jae 401460 <stdfmod_hotpath+0x20>
ret
xchg ax,ax
mov r9d,eax
mov r11d,esi
mov edi,r8d
shr r11d,0x17
shr r9d,0x17
and edi,0x80000000
mov r10d,r9d
lea ecx,[r11-0x18]
sub r10d,r11d
cmp ecx,0xde
ja 4014d0 <stdfmod_hotpath+0x90>
cmp r10d,0x8
jg 4014d0 <stdfmod_hotpath+0x90>
shl esi,0x8
mov ecx,r10d
shl eax,0x8
xor edx,edx
or esi,0x80000000
or eax,0x80000000
shr esi,cl
div esi
bsr ecx,edx
xor ecx,0x1f
shl edx,cl
add ecx,0x1
shr edx,0x8
sub r9d,ecx
shl r9d,0x17
or edx,edi
lea eax,[r9+rdx*1]
movd xmm0,eax
ret
nop DWORD PTR [rax+0x0]
and edx,0x7fffff
mov eax,r8d
xor r8d,r8d
mov esi,edx
and eax,0x7fffff
or esi,0x800000
or eax,0x800000
tzcnt r8d,esi
cmp r10d,r8d
mov ecx,r8d
cmovle ecx,r10d
shr esi,cl
sub r10d,ecx
lea r11d,[r11+rcx*1-0x1]
mov ecx,0x8
cmp r10d,ecx
cmovle ecx,r10d
xor edx,edx
shl eax,cl
div esi
mov r9d,edx
sub r10d,ecx
je 4015e2 <stdfmod_hotpath+0x1a2>
mov eax,0xffffffff
xor edx,edx
add r8d,0x8
push rbp
div esi
push rbx
cmp r8d,r10d
jge 4015a2 <stdfmod_hotpath+0x162>
mov ebp,0x20
sub ebp,r8d
data16 cs nop WORD PTR [rax+rax*1+0x0]
data16 cs nop WORD PTR [rax+rax*1+0x0]
data16 cs nop WORD PTR [rax+rax*1+0x0]
data16 cs nop WORD PTR [rax+rax*1+0x0]
data16 cs nop WORD PTR [rax+rax*1+0x0]
nop DWORD PTR [rax+rax*1+0x0]
mov edx,r9d
mov ebx,r9d
mov ecx,r8d
sub r10d,r8d
imul edx,eax
shl ebx,cl
mov ecx,ebp
shr edx,cl
imul edx,esi
sub ebx,edx
mov r9d,ebx
cmp r8d,r10d
jl 401580 <stdfmod_hotpath+0x140>
imul eax,r9d
mov ecx,r10d
mov edx,r9d
shl edx,cl
mov ecx,0x20
sub ecx,r10d
shr eax,cl
imul eax,esi
sub edx,eax
mov eax,r11d
bsr ecx,edx
xor ecx,0x1f
sub ecx,0x8
shl edx,cl
sub eax,ecx
js 401604 <stdfmod_hotpath+0x1c4>
test edx,edx
je 401604 <stdfmod_hotpath+0x1c4>
shl eax,0x17
add edx,eax
pop rbx
pop rbp
lea eax,[rdx+rdi*1]
movd xmm0,eax
ret
bsr ecx,edx
mov eax,r11d
xor ecx,0x1f
sub ecx,0x8
shl edx,cl
sub eax,ecx
js 40160d <stdfmod_hotpath+0x1cd>
test edx,edx
je 40160d <stdfmod_hotpath+0x1cd>
add edx,edi
shl eax,0x17
add eax,edx
movd xmm0,eax
ret
sub ecx,r11d
xor eax,eax
shr edx,cl
jmp 4015d6 <stdfmod_hotpath+0x196>
sub ecx,r11d
xor eax,eax
shr edx,cl
jmp 4015f8 <stdfmod_hotpath+0x1b8>
/* Floating-point remainder function.
Copyright (C) 2023-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <math.h>
#include <stdint.h>
#include <stdbool.h>
#include <errno.h>
#define BIT_WIDTH 32
#define MANTISSA_WIDTH 23
#define EXPONENT_WIDTH 8
#define MANTISSA_MASK 0x007fffff
#define EXPONENT_MASK 0x7f800000
#define EXP_MANT_MASK 0x7fffffff
#define QUIET_NAN_MASK 0x00400000
#define SIGN_MASK 0x80000000
#define NOINLINE __attribute__ ((noinline))
NOINLINE static float
with_errnof (float y, int e)
{
errno = e;
return y;
}
float
__math_edomf (float y)
{
return with_errnof (y, EDOM);
}
static inline bool
is_nan (uint32_t x)
{
return (x & EXP_MANT_MASK) > EXPONENT_MASK;
}
static inline uint32_t
get_mantissa (uint32_t x)
{
return x & MANTISSA_MASK;
}
static inline float
asfloat (uint32_t i)
{
union
{
uint32_t i;
float f;
} u = {
i};
return u.f;
}
/* Convert integer number X, unbiased exponent EP, and sign S to double:
result = X * 2^(EP+1 - exponent_bias)
NB: zero is not supported. */
static inline double
make_float (uint32_t x, int ep, uint32_t s)
{
int lz = __builtin_clz (x) - EXPONENT_WIDTH;
x <<= lz;
ep -= lz;
if (__glibc_unlikely (ep < 0 || x == 0))
{
x >>= -ep;
ep = 0;
}
return asfloat (
s + x + (ep << MANTISSA_WIDTH));
}
static inline uint32_t
asuint (float f)
{
union
{
float f;
uint32_t i;
} u = {f};
return u.i;
}
/* With x = mx * 2^ex and y = my * 2^ey (mx, my, ex, ey being integers), the
simplest implementation is:
mx * 2^ex == 2 * mx * 2^(ex - 1)
or
while (ex > ey)
{
mx *= 2;
--ex;
mx %= my;
}
With the mathematical equivalence of:
r == x % y == (x % (N * y)) % y
And with mx/my being mantissa of a single floating point number (which uses
less bits than the storage type), on each step the argument reduction can
be improved by 8 (which is the size of uint32_t minus MANTISSA_WIDTH plus
the implicit one bit):
mx * 2^ex == 2^8 * mx * 2^(ex - 8)
or
while (ex > ey)
{
mx << 8;
ex -= 8;
mx %= my;
}
Special cases:
- If x or y is a NaN, a NaN is returned.
- If x is an infinity, or y is zero, a NaN is returned and EDOM is set.
- If x is +0/-0, and y is not zero, +0/-0 is returned. */
float
stdfmod (float x, float y)
{
uint32_t hx = asuint (
x);
uint32_t hy = asuint (
y);
uint32_t sx = hx & SIGN_MASK;
/* Get |x| and |y|. */
hx ^= sx;
hy &= ~SIGN_MASK;
if (__glibc_likely (hx < hy))
{
/* If y is a NaN, return a NaN. */
if (__glibc_unlikely (hy > EXPONENT_MASK))
return x * y;
return x;
}
int ex = hx >> MANTISSA_WIDTH;
int ey = hy >> MANTISSA_WIDTH;
int exp_diff = ex - ey;
/* Common case where exponents are close: |x/y| < 2^9, x not inf/NaN
and |x%y| not denormal. */
if (__glibc_likely (ey < (EXPONENT_MASK >> MANTISSA_WIDTH) - EXPONENT_WIDTH
&& ey > MANTISSA_WIDTH
&& exp_diff <= EXPONENT_WIDTH))
{
uint32_t mx = (hx << EXPONENT_WIDTH) | SIGN_MASK;
uint32_t my = (hy << EXPONENT_WIDTH) | SIGN_MASK;
mx %= (my >> exp_diff);
if (__glibc_unlikely (mx == 0))
return asfloat (
sx);
int shift = __builtin_clz (mx);
ex -= shift + 1;
mx <<= shift;
mx = sx | (mx >> EXPONENT_WIDTH);
return asfloat (
mx + ((uint32_t)ex << MANTISSA_WIDTH));
}
if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK))
{
/* If x is a NaN, return a NaN. */
if (hx > EXPONENT_MASK)
return x * y;
/* If x is an infinity or y is zero, return a NaN and set EDOM. */
return __math_edomf (
(x * y) / (x * y));
}
/* Special case, both x and y are denormal. */
if (__glibc_unlikely (ex == 0))
return asfloat (
sx | hx % hy);
/* Extract normalized mantissas - hx is not denormal and hy != 0. */
uint32_t mx = get_mantissa (
hx) | (MANTISSA_MASK + 1);
uint32_t my = get_mantissa (
hy) | (MANTISSA_MASK + 1);
int lead_zeros_my = EXPONENT_WIDTH;
ey--;
/* Special case for denormal y. */
if (__glibc_unlikely (ey < 0))
{
my = hy;
ey = 0;
exp_diff--;
lead_zeros_my = __builtin_clz (my);
}
int tail_zeros_my = __builtin_ctz (my);
int sides_zeroes = lead_zeros_my + tail_zeros_my;
int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my;
my >>= right_shift;
exp_diff -= right_shift;
ey += right_shift;
int left_shift = exp_diff < EXPONENT_WIDTH ? exp_diff : EXPONENT_WIDTH;
mx <<= left_shift;
exp_diff -= left_shift;
mx %= my;
if (__glibc_unlikely (mx == 0))
return asfloat (
sx);
if (exp_diff == 0)
return make_float (mx, ey,
sx);
/* Multiplication with the inverse is faster than repeated modulo. */
uint32_t inv_hy = UINT32_MAX / my;
while (exp_diff > sides_zeroes) {
exp_diff -= sides_zeroes;
uint32_t hd = (mx * inv_hy) >> (BIT_WIDTH - sides_zeroes);
mx <<= sides_zeroes;
mx -= hd * my;
while (__glibc_unlikely (mx > my))
mx -= my;
}
uint32_t hd = (mx * inv_hy) >> (BIT_WIDTH - exp_diff);
mx <<= exp_diff;
mx -= hd * my;
while (__glibc_unlikely (mx > my))
mx -= my;
return make_float (mx, ey,
sx);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment