eira-fransham · April 11, 2025 13:56
diff --git a/fmod.asm b/fmod.asm
 stdfmod:
 movd   esi,xmm0
 movd   edx,xmm1
 mov    eax,esi
 mov    ecx,edx
 and    eax,0x7fffffff
 and    ecx,0x7fffffff
 cmp    eax,ecx
 jae    4011a0 <stdfmod+0x30>
 cmp    ecx,0x7f800000
 ja     401198 <stdfmod+0x28>
 ret
 nop    DWORD PTR [rax+0x0]
 mulss  xmm0,xmm1
 ret
 nop    DWORD PTR [rax]
 mov    r9d,eax
 mov    r10d,ecx
 mov    edi,esi
 shr    r10d,0x17
 shr    r9d,0x17
 and    edi,0x80000000
 mov    r8d,r9d
 lea    r11d,[r10-0x18]
 sub    r8d,r10d
 cmp    r11d,0xde
 ja     401220 <stdfmod+0xb0>
 cmp    r8d,0x8
 jg     401390 <stdfmod+0x220>
 shl    ecx,0x8
 shl    eax,0x8
 mov    edx,ecx
 mov    ecx,r8d
 or     eax,0x80000000
 or     edx,0x80000000
 shr    edx,cl
 mov    ecx,edx
 xor    edx,edx
 div    ecx
 test   edx,edx
 je     4013c1 <stdfmod+0x251>
 bsr    ecx,edx
 xor    ecx,0x1f
 shl    edx,cl
 add    ecx,0x1
 shr    edx,0x8
 sub    r9d,ecx
 shl    r9d,0x17
 or     edx,edi
 lea    eax,[r9+rdx*1]
 movd   xmm0,eax
 ret
 nop    DWORD PTR [rax+0x0]
 test   ecx,ecx
 je     4013e9 <stdfmod+0x279>
 cmp    eax,0x7f7fffff
 ja     4013e9 <stdfmod+0x279>
 test   r9d,r9d
 je     401401 <stdfmod+0x291>
 and    esi,0x7fffff
 mov    r11d,0x8
 mov    eax,esi
 mov    esi,edx
 lea    edx,[r10-0x1]
 and    esi,0x7fffff
 or     eax,0x800000
 or     esi,0x800000
 test   r10d,r10d
 je     40140c <stdfmod+0x29c>
 push   rbp
 movd   xmm0,edi
 push   rbx
 xor    ebx,ebx
 tzcnt  ebx,esi
 cmp    r8d,ebx
 mov    ecx,ebx
 cmovle ecx,r8d
 sub    r8d,ecx
 shr    esi,cl
 lea    r10d,[rdx+rcx*1]
 mov    ecx,0x8
 cmp    r8d,ecx
 cmovle ecx,r8d
 xor    edx,edx
 shl    eax,cl
 sub    r8d,ecx
 div    esi
 mov    r9d,edx
 test   edx,edx
 je     401373 <stdfmod+0x203>
 test   r8d,r8d
 je     4013c6 <stdfmod+0x256>
 mov    eax,0xffffffff
 xor    edx,edx
 add    r11d,ebx
 div    esi
 cmp    r11d,r8d
 jge    401326 <stdfmod+0x1b6>
 mov    ebp,0x20
 sub    ebp,r11d
 data16 cs nop WORD PTR [rax+rax*1+0x0]
 data16 cs nop WORD PTR [rax+rax*1+0x0]
 data16 cs nop WORD PTR [rax+rax*1+0x0]
 data16 cs nop WORD PTR [rax+rax*1+0x0]
 cs nop WORD PTR [rax+rax*1+0x0]
 mov    edx,r9d
 mov    ebx,r9d
 mov    ecx,r11d
 sub    r8d,r11d
 imul   edx,eax
 shl    ebx,cl
 mov    ecx,ebp
 shr    edx,cl
 imul   edx,esi
 sub    ebx,edx
 mov    r9d,ebx
 cmp    esi,ebx
 jb     401376 <stdfmod+0x206>
 cmp    r11d,r8d
 jl     401300 <stdfmod+0x190>
 mov    edx,r9d
 imul   r9d,eax
 mov    ecx,r8d
 shl    edx,cl
 mov    ecx,0x20
 sub    ecx,r8d
 mov    eax,edx
 shr    r9d,cl
 imul   r9d,esi
 sub    eax,r9d
 cmp    esi,eax
 jb     401380 <stdfmod+0x210>
 bsr    ecx,eax
 mov    edx,r10d
 xor    ecx,0x1f
 sub    ecx,0x8
 shl    eax,cl
 sub    edx,ecx
 js     401421 <stdfmod+0x2b1>
 test   eax,eax
 je     401421 <stdfmod+0x2b1>
 add    eax,edi
 shl    edx,0x17
 add    eax,edx
 movd   xmm0,eax
 pop    rbx
 pop    rbp
 ret
 sub    r9d,esi
 cmp    esi,r9d
 jae    401321 <stdfmod+0x1b1>
 jmp    401376 <stdfmod+0x206>
 sub    eax,esi
 cmp    esi,eax
 jae    40134a <stdfmod+0x1da>
 jmp    401380 <stdfmod+0x210>
 nop    DWORD PTR [rax+rax*1+0x0]
 cmp    eax,0x7f7fffff
 ja     4013e9 <stdfmod+0x279>
 and    esi,0x7fffff
 mov    r11d,0x8
 mov    eax,esi
 mov    esi,edx
 lea    edx,[r10-0x1]
 and    esi,0x7fffff
 or     eax,0x800000
 or     esi,0x800000
 jmp    40126a <stdfmod+0xfa>
 movd   xmm0,edi
 ret
 bsr    ecx,edx
 mov    eax,r10d
 xor    ecx,0x1f
 sub    ecx,0x8
 shl    edx,cl
 sub    eax,ecx
 js     40142d <stdfmod+0x2bd>
 test   edx,edx
 je     40142d <stdfmod+0x2bd>
 add    edx,edi
 shl    eax,0x17
 add    eax,edx
 movd   xmm0,eax
 jmp    401373 <stdfmod+0x203>
 mulss  xmm0,xmm1
 cmp    eax,0x7f800000
 ja     401193 <stdfmod+0x23>
 divss  xmm0,xmm0
 jmp    401140 <with_errnof.constprop.0>
 xor    edx,edx
 div    ecx
 or     edx,edi
 movd   xmm0,edx
 ret
 sub    r8d,0x1
 bsr    r11d,ecx
 mov    esi,ecx
 xor    edx,edx
 xor    r11d,0x1f
 jmp    40126a <stdfmod+0xfa>
 sub    ecx,r10d
 xor    edx,edx
 shr    eax,cl
 jmp    401368 <stdfmod+0x1f8>
 sub    ecx,r10d
 xor    eax,eax
 shr    edx,cl
 jmp    4013dc <stdfmod+0x26c>
diff --git a/hotpath.asm b/hotpath.asm
 stdfmod_hotpath:
 movd   r8d,xmm0
 movd   edx,xmm1
 mov    eax,r8d
 mov    esi,edx
 and    eax,0x7fffffff
 and    esi,0x7fffffff
 cmp    eax,esi
 jae    401460 <stdfmod_hotpath+0x20>
 ret
 xchg   ax,ax
 mov    r9d,eax
 mov    r11d,esi
 mov    edi,r8d
 shr    r11d,0x17
 shr    r9d,0x17
 and    edi,0x80000000
 mov    r10d,r9d
 lea    ecx,[r11-0x18]
 sub    r10d,r11d
 cmp    ecx,0xde
 ja     4014d0 <stdfmod_hotpath+0x90>
 cmp    r10d,0x8
 jg     4014d0 <stdfmod_hotpath+0x90>
 shl    esi,0x8
 mov    ecx,r10d
 shl    eax,0x8
 xor    edx,edx
 or     esi,0x80000000
 or     eax,0x80000000
 shr    esi,cl
 div    esi
 bsr    ecx,edx
 xor    ecx,0x1f
 shl    edx,cl
 add    ecx,0x1
 shr    edx,0x8
 sub    r9d,ecx
 shl    r9d,0x17
 or     edx,edi
 lea    eax,[r9+rdx*1]
 movd   xmm0,eax
 ret
 nop    DWORD PTR [rax+0x0]
 and    edx,0x7fffff
 mov    eax,r8d
 xor    r8d,r8d
 mov    esi,edx
 and    eax,0x7fffff
 or     esi,0x800000
 or     eax,0x800000
 tzcnt  r8d,esi
 cmp    r10d,r8d
 mov    ecx,r8d
 cmovle ecx,r10d
 shr    esi,cl
 sub    r10d,ecx
 lea    r11d,[r11+rcx*1-0x1]
 mov    ecx,0x8
 cmp    r10d,ecx
 cmovle ecx,r10d
 xor    edx,edx
 shl    eax,cl
 div    esi
 mov    r9d,edx
 sub    r10d,ecx
 je     4015e2 <stdfmod_hotpath+0x1a2>
 mov    eax,0xffffffff
 xor    edx,edx
 add    r8d,0x8
 push   rbp
 div    esi
 push   rbx
 cmp    r8d,r10d
 jge    4015a2 <stdfmod_hotpath+0x162>
 mov    ebp,0x20
 sub    ebp,r8d
 data16 cs nop WORD PTR [rax+rax*1+0x0]
 data16 cs nop WORD PTR [rax+rax*1+0x0]
 data16 cs nop WORD PTR [rax+rax*1+0x0]
 data16 cs nop WORD PTR [rax+rax*1+0x0]
 data16 cs nop WORD PTR [rax+rax*1+0x0]
 nop    DWORD PTR [rax+rax*1+0x0]
 mov    edx,r9d
 mov    ebx,r9d
 mov    ecx,r8d
 sub    r10d,r8d
 imul   edx,eax
 shl    ebx,cl
 mov    ecx,ebp
 shr    edx,cl
 imul   edx,esi
 sub    ebx,edx
 mov    r9d,ebx
 cmp    r8d,r10d
 jl     401580 <stdfmod_hotpath+0x140>
 imul   eax,r9d
 mov    ecx,r10d
 mov    edx,r9d
 shl    edx,cl
 mov    ecx,0x20
 sub    ecx,r10d
 shr    eax,cl
 imul   eax,esi
 sub    edx,eax
 mov    eax,r11d
 bsr    ecx,edx
 xor    ecx,0x1f
 sub    ecx,0x8
 shl    edx,cl
 sub    eax,ecx
 js     401604 <stdfmod_hotpath+0x1c4>
 test   edx,edx
 je     401604 <stdfmod_hotpath+0x1c4>
 shl    eax,0x17
 add    edx,eax
 pop    rbx
 pop    rbp
 lea    eax,[rdx+rdi*1]
 movd   xmm0,eax
 ret
 bsr    ecx,edx
 mov    eax,r11d
 xor    ecx,0x1f
 sub    ecx,0x8
 shl    edx,cl
 sub    eax,ecx
 js     40160d <stdfmod_hotpath+0x1cd>
 test   edx,edx
 je     40160d <stdfmod_hotpath+0x1cd>
 add    edx,edi
 shl    eax,0x17
 add    eax,edx
 movd   xmm0,eax
 ret
 sub    ecx,r11d
 xor    eax,eax
 shr    edx,cl
 jmp    4015d6 <stdfmod_hotpath+0x196>
 sub    ecx,r11d
 xor    eax,eax
 shr    edx,cl
 jmp    4015f8 <stdfmod_hotpath+0x1b8>
diff --git a/input.c b/input.c
 /* Floating-point remainder function.
   Copyright (C) 2023-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
 #include <math.h>
 #include <stdint.h>
 #include <stdbool.h>
 #include <errno.h>

 #define BIT_WIDTH       32
 #define MANTISSA_WIDTH  23
 #define EXPONENT_WIDTH  8
 #define MANTISSA_MASK   0x007fffff
 #define EXPONENT_MASK   0x7f800000
 #define EXP_MANT_MASK   0x7fffffff
 #define QUIET_NAN_MASK  0x00400000
 #define SIGN_MASK       0x80000000

 #define NOINLINE __attribute__ ((noinline))

 NOINLINE static float
 with_errnof (float y, int e)
 {
  errno = e;
  return y;
 }

 float
 __math_edomf (float y)
 {
  return with_errnof (y, EDOM);
 }

 static inline bool
 is_nan (uint32_t x)
 {
  return (x & EXP_MANT_MASK) > EXPONENT_MASK;
 }

 static inline uint32_t
 get_mantissa (uint32_t x)
 {
  return x & MANTISSA_MASK;
 }

 static inline float
 asfloat (uint32_t i)
 {
  union
  {
    uint32_t i;
    float f;
  } u = {
 i};
  return u.f;
 }

 /* Convert integer number X, unbiased exponent EP, and sign S to double:
   result = X * 2^(EP+1 - exponent_bias)
   NB: zero is not supported.  */
 static inline double
 make_float (uint32_t x, int ep, uint32_t s)
 {
  int lz = __builtin_clz (x) - EXPONENT_WIDTH;
  x <<= lz;
  ep -= lz;
  if (__glibc_unlikely (ep < 0 || x == 0))
    {
      x >>= -ep;
      ep = 0;
    }
  return asfloat (
 s + x + (ep << MANTISSA_WIDTH));
 }

 static inline uint32_t
 asuint (float f)
 {
  union
  {
    float f;
    uint32_t i;
  } u = {f};
  return u.i;
 }
 /* With x = mx * 2^ex and y = my * 2^ey (mx, my, ex, ey being integers), the
   simplest implementation is:
   mx * 2^ex == 2 * mx * 2^(ex - 1)
   or
   while (ex > ey)
     {
       mx *= 2;
       --ex;
       mx %= my;
     }
   With the mathematical equivalence of:
   r == x % y == (x % (N * y)) % y
   And with mx/my being mantissa of a single floating point number (which uses
   less bits than the storage type), on each step the argument reduction can
   be improved by 8 (which is the size of uint32_t minus MANTISSA_WIDTH plus
   the implicit one bit):
   mx * 2^ex == 2^8 * mx * 2^(ex - 8)
   or
   while (ex > ey)
     {
       mx << 8;
       ex -= 8;
       mx %= my;
     }
   Special cases:
     - If x or y is a NaN, a NaN is returned.
     - If x is an infinity, or y is zero, a NaN is returned and EDOM is set.
     - If x is +0/-0, and y is not zero, +0/-0 is returned.  */
 float
 stdfmod (float x, float y)
 {
  uint32_t hx = asuint (
 x);
  uint32_t hy = asuint (
 y);
  uint32_t sx = hx & SIGN_MASK;
  /* Get |x| and |y|.  */
  hx ^= sx;
  hy &= ~SIGN_MASK;
  if (__glibc_likely (hx < hy))
    {
      /* If y is a NaN, return a NaN.  */
      if (__glibc_unlikely (hy > EXPONENT_MASK))
 	return x * y;
      return x;
    }
  int ex = hx >> MANTISSA_WIDTH;
  int ey = hy >> MANTISSA_WIDTH;
  int exp_diff = ex - ey;
  /* Common case where exponents are close: |x/y| < 2^9, x not inf/NaN
     and |x%y| not denormal.  */
  if (__glibc_likely (ey < (EXPONENT_MASK >> MANTISSA_WIDTH) - EXPONENT_WIDTH
 		     && ey > MANTISSA_WIDTH
 		     && exp_diff <= EXPONENT_WIDTH))
    {
      uint32_t mx = (hx << EXPONENT_WIDTH) | SIGN_MASK;
      uint32_t my = (hy << EXPONENT_WIDTH) | SIGN_MASK;
      mx %= (my >> exp_diff);
      if (__glibc_unlikely (mx == 0))
 	return asfloat (
 sx);
      int shift = __builtin_clz (mx);
      ex -= shift + 1;
      mx <<= shift;
      mx = sx | (mx >> EXPONENT_WIDTH);
      return asfloat (
 mx + ((uint32_t)ex << MANTISSA_WIDTH));
    }
  if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK))
    {
      /* If x is a NaN, return a NaN.  */
      if (hx > EXPONENT_MASK)
 	return x * y;
      /* If x is an infinity or y is zero, return a NaN and set EDOM.  */
      return __math_edomf (
 (x * y) / (x * y));
    }
  /* Special case, both x and y are denormal.  */
  if (__glibc_unlikely (ex == 0))
    return asfloat (
 sx | hx % hy);
  /* Extract normalized mantissas - hx is not denormal and hy != 0.  */
  uint32_t mx = get_mantissa (
 hx) | (MANTISSA_MASK + 1);
  uint32_t my = get_mantissa (
 hy) | (MANTISSA_MASK + 1);
  int lead_zeros_my = EXPONENT_WIDTH;
  ey--;
  /* Special case for denormal y.  */
  if (__glibc_unlikely (ey < 0))
    {
      my = hy;
      ey = 0;
      exp_diff--;
      lead_zeros_my = __builtin_clz (my);
    }
  int tail_zeros_my = __builtin_ctz (my);
  int sides_zeroes = lead_zeros_my + tail_zeros_my;
  int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my;
  my >>= right_shift;
  exp_diff -= right_shift;
  ey += right_shift;
  int left_shift = exp_diff < EXPONENT_WIDTH ? exp_diff : EXPONENT_WIDTH;
  mx <<= left_shift;
  exp_diff -= left_shift;
  mx %= my;
  if (__glibc_unlikely (mx == 0))
    return asfloat (
 sx);
  if (exp_diff == 0)
    return make_float (mx, ey, 
 sx);
  /* Multiplication with the inverse is faster than repeated modulo.  */
  uint32_t inv_hy = UINT32_MAX / my;
  while (exp_diff > sides_zeroes) {
    exp_diff -= sides_zeroes;
    uint32_t hd = (mx * inv_hy) >> (BIT_WIDTH - sides_zeroes);
    mx <<= sides_zeroes;
    mx -= hd * my;
    while (__glibc_unlikely (mx > my))
      mx -= my;
  }
  uint32_t hd = (mx * inv_hy) >> (BIT_WIDTH - exp_diff);
  mx <<= exp_diff;
  mx -= hd * my;
  while (__glibc_unlikely (mx > my))
    mx -= my;
  return make_float (mx, ey, 
 sx);
 }
	stdfmod:
	movd esi,xmm0
	movd edx,xmm1
	mov eax,esi
	mov ecx,edx
	and eax,0x7fffffff
	and ecx,0x7fffffff
	cmp eax,ecx
	jae 4011a0 <stdfmod+0x30>
	cmp ecx,0x7f800000
	ja 401198 <stdfmod+0x28>
	ret
	nop DWORD PTR [rax+0x0]
	mulss xmm0,xmm1
	ret
	nop DWORD PTR [rax]
	mov r9d,eax
	mov r10d,ecx
	mov edi,esi
	shr r10d,0x17
	shr r9d,0x17
	and edi,0x80000000
	mov r8d,r9d
	lea r11d,[r10-0x18]
	sub r8d,r10d
	cmp r11d,0xde
	ja 401220 <stdfmod+0xb0>
	cmp r8d,0x8
	jg 401390 <stdfmod+0x220>
	shl ecx,0x8
	shl eax,0x8
	mov edx,ecx
	mov ecx,r8d
	or eax,0x80000000
	or edx,0x80000000
	shr edx,cl
	mov ecx,edx
	xor edx,edx
	div ecx
	test edx,edx
	je 4013c1 <stdfmod+0x251>
	bsr ecx,edx
	xor ecx,0x1f
	shl edx,cl
	add ecx,0x1
	shr edx,0x8
	sub r9d,ecx
	shl r9d,0x17
	or edx,edi
	lea eax,[r9+rdx*1]
	movd xmm0,eax
	ret
	nop DWORD PTR [rax+0x0]
	test ecx,ecx
	je 4013e9 <stdfmod+0x279>
	cmp eax,0x7f7fffff
	ja 4013e9 <stdfmod+0x279>
	test r9d,r9d
	je 401401 <stdfmod+0x291>
	and esi,0x7fffff
	mov r11d,0x8
	mov eax,esi
	mov esi,edx
	lea edx,[r10-0x1]
	and esi,0x7fffff
	or eax,0x800000
	or esi,0x800000
	test r10d,r10d
	je 40140c <stdfmod+0x29c>
	push rbp
	movd xmm0,edi
	push rbx
	xor ebx,ebx
	tzcnt ebx,esi
	cmp r8d,ebx
	mov ecx,ebx
	cmovle ecx,r8d
	sub r8d,ecx
	shr esi,cl
	lea r10d,[rdx+rcx*1]
	mov ecx,0x8
	cmp r8d,ecx
	cmovle ecx,r8d
	xor edx,edx
	shl eax,cl
	sub r8d,ecx
	div esi
	mov r9d,edx
	test edx,edx
	je 401373 <stdfmod+0x203>
	test r8d,r8d
	je 4013c6 <stdfmod+0x256>
	mov eax,0xffffffff
	xor edx,edx
	add r11d,ebx
	div esi
	cmp r11d,r8d
	jge 401326 <stdfmod+0x1b6>
	mov ebp,0x20
	sub ebp,r11d
	data16 cs nop WORD PTR [rax+rax*1+0x0]
	data16 cs nop WORD PTR [rax+rax*1+0x0]
	data16 cs nop WORD PTR [rax+rax*1+0x0]
	data16 cs nop WORD PTR [rax+rax*1+0x0]
	cs nop WORD PTR [rax+rax*1+0x0]
	mov edx,r9d
	mov ebx,r9d
	mov ecx,r11d
	sub r8d,r11d
	imul edx,eax
	shl ebx,cl
	mov ecx,ebp
	shr edx,cl
	imul edx,esi
	sub ebx,edx
	mov r9d,ebx
	cmp esi,ebx
	jb 401376 <stdfmod+0x206>
	cmp r11d,r8d
	jl 401300 <stdfmod+0x190>
	mov edx,r9d
	imul r9d,eax
	mov ecx,r8d
	shl edx,cl
	mov ecx,0x20
	sub ecx,r8d
	mov eax,edx
	shr r9d,cl
	imul r9d,esi
	sub eax,r9d
	cmp esi,eax
	jb 401380 <stdfmod+0x210>
	bsr ecx,eax
	mov edx,r10d
	xor ecx,0x1f
	sub ecx,0x8
	shl eax,cl
	sub edx,ecx
	js 401421 <stdfmod+0x2b1>
	test eax,eax
	je 401421 <stdfmod+0x2b1>
	add eax,edi
	shl edx,0x17
	add eax,edx
	movd xmm0,eax
	pop rbx
	pop rbp
	ret
	sub r9d,esi
	cmp esi,r9d
	jae 401321 <stdfmod+0x1b1>
	jmp 401376 <stdfmod+0x206>
	sub eax,esi
	cmp esi,eax
	jae 40134a <stdfmod+0x1da>
	jmp 401380 <stdfmod+0x210>
	nop DWORD PTR [rax+rax*1+0x0]
	cmp eax,0x7f7fffff
	ja 4013e9 <stdfmod+0x279>
	and esi,0x7fffff
	mov r11d,0x8
	mov eax,esi
	mov esi,edx
	lea edx,[r10-0x1]
	and esi,0x7fffff
	or eax,0x800000
	or esi,0x800000
	jmp 40126a <stdfmod+0xfa>
	movd xmm0,edi
	ret
	bsr ecx,edx
	mov eax,r10d
	xor ecx,0x1f
	sub ecx,0x8
	shl edx,cl
	sub eax,ecx
	js 40142d <stdfmod+0x2bd>
	test edx,edx
	je 40142d <stdfmod+0x2bd>
	add edx,edi
	shl eax,0x17
	add eax,edx
	movd xmm0,eax
	jmp 401373 <stdfmod+0x203>
	mulss xmm0,xmm1
	cmp eax,0x7f800000
	ja 401193 <stdfmod+0x23>
	divss xmm0,xmm0
	jmp 401140 <with_errnof.constprop.0>
	xor edx,edx
	div ecx
	or edx,edi
	movd xmm0,edx
	ret
	sub r8d,0x1
	bsr r11d,ecx
	mov esi,ecx
	xor edx,edx
	xor r11d,0x1f
	jmp 40126a <stdfmod+0xfa>
	sub ecx,r10d
	xor edx,edx
	shr eax,cl
	jmp 401368 <stdfmod+0x1f8>
	sub ecx,r10d
	xor eax,eax
	shr edx,cl
	jmp 4013dc <stdfmod+0x26c>
	stdfmod_hotpath:
	movd r8d,xmm0
	movd edx,xmm1
	mov eax,r8d
	mov esi,edx
	and eax,0x7fffffff
	and esi,0x7fffffff
	cmp eax,esi
	jae 401460 <stdfmod_hotpath+0x20>
	ret
	xchg ax,ax
	mov r9d,eax
	mov r11d,esi
	mov edi,r8d
	shr r11d,0x17
	shr r9d,0x17
	and edi,0x80000000
	mov r10d,r9d
	lea ecx,[r11-0x18]
	sub r10d,r11d
	cmp ecx,0xde
	ja 4014d0 <stdfmod_hotpath+0x90>
	cmp r10d,0x8
	jg 4014d0 <stdfmod_hotpath+0x90>
	shl esi,0x8
	mov ecx,r10d
	shl eax,0x8
	xor edx,edx
	or esi,0x80000000
	or eax,0x80000000
	shr esi,cl
	div esi
	bsr ecx,edx
	xor ecx,0x1f
	shl edx,cl
	add ecx,0x1
	shr edx,0x8
	sub r9d,ecx
	shl r9d,0x17
	or edx,edi
	lea eax,[r9+rdx*1]
	movd xmm0,eax
	ret
	nop DWORD PTR [rax+0x0]
	and edx,0x7fffff
	mov eax,r8d
	xor r8d,r8d
	mov esi,edx
	and eax,0x7fffff
	or esi,0x800000
	or eax,0x800000
	tzcnt r8d,esi
	cmp r10d,r8d
	mov ecx,r8d
	cmovle ecx,r10d
	shr esi,cl
	sub r10d,ecx
	lea r11d,[r11+rcx*1-0x1]
	mov ecx,0x8
	cmp r10d,ecx
	cmovle ecx,r10d
	xor edx,edx
	shl eax,cl
	div esi
	mov r9d,edx
	sub r10d,ecx
	je 4015e2 <stdfmod_hotpath+0x1a2>
	mov eax,0xffffffff
	xor edx,edx
	add r8d,0x8
	push rbp
	div esi
	push rbx
	cmp r8d,r10d
	jge 4015a2 <stdfmod_hotpath+0x162>
	mov ebp,0x20
	sub ebp,r8d
	data16 cs nop WORD PTR [rax+rax*1+0x0]
	data16 cs nop WORD PTR [rax+rax*1+0x0]
	data16 cs nop WORD PTR [rax+rax*1+0x0]
	data16 cs nop WORD PTR [rax+rax*1+0x0]
	data16 cs nop WORD PTR [rax+rax*1+0x0]
	nop DWORD PTR [rax+rax*1+0x0]
	mov edx,r9d
	mov ebx,r9d
	mov ecx,r8d
	sub r10d,r8d
	imul edx,eax
	shl ebx,cl
	mov ecx,ebp
	shr edx,cl
	imul edx,esi
	sub ebx,edx
	mov r9d,ebx
	cmp r8d,r10d
	jl 401580 <stdfmod_hotpath+0x140>
	imul eax,r9d
	mov ecx,r10d
	mov edx,r9d
	shl edx,cl
	mov ecx,0x20
	sub ecx,r10d
	shr eax,cl
	imul eax,esi
	sub edx,eax
	mov eax,r11d
	bsr ecx,edx
	xor ecx,0x1f
	sub ecx,0x8
	shl edx,cl
	sub eax,ecx
	js 401604 <stdfmod_hotpath+0x1c4>
	test edx,edx
	je 401604 <stdfmod_hotpath+0x1c4>
	shl eax,0x17
	add edx,eax
	pop rbx
	pop rbp
	lea eax,[rdx+rdi*1]
	movd xmm0,eax
	ret
	bsr ecx,edx
	mov eax,r11d
	xor ecx,0x1f
	sub ecx,0x8
	shl edx,cl
	sub eax,ecx
	js 40160d <stdfmod_hotpath+0x1cd>
	test edx,edx
	je 40160d <stdfmod_hotpath+0x1cd>
	add edx,edi
	shl eax,0x17
	add eax,edx
	movd xmm0,eax
	ret
	sub ecx,r11d
	xor eax,eax
	shr edx,cl
	jmp 4015d6 <stdfmod_hotpath+0x196>
	sub ecx,r11d
	xor eax,eax
	shr edx,cl
	jmp 4015f8 <stdfmod_hotpath+0x1b8>
	/* Floating-point remainder function.
	Copyright (C) 2023-2024 Free Software Foundation, Inc.
	This file is part of the GNU C Library.
	The GNU C Library is free software; you can redistribute it and/or
	modify it under the terms of the GNU Lesser General Public
	License as published by the Free Software Foundation; either
	version 2.1 of the License, or (at your option) any later version.
	The GNU C Library is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	Lesser General Public License for more details.
	You should have received a copy of the GNU Lesser General Public
	License along with the GNU C Library; if not, see
	<https://www.gnu.org/licenses/>. */
	#include <math.h>
	#include <stdint.h>
	#include <stdbool.h>
	#include <errno.h>

	#define BIT_WIDTH 32
	#define MANTISSA_WIDTH 23
	#define EXPONENT_WIDTH 8
	#define MANTISSA_MASK 0x007fffff
	#define EXPONENT_MASK 0x7f800000
	#define EXP_MANT_MASK 0x7fffffff
	#define QUIET_NAN_MASK 0x00400000
	#define SIGN_MASK 0x80000000

	#define NOINLINE __attribute__ ((noinline))

	NOINLINE static float
	with_errnof (float y, int e)
	{
	errno = e;
	return y;
	}

	float
	__math_edomf (float y)
	{
	return with_errnof (y, EDOM);
	}

	static inline bool
	is_nan (uint32_t x)
	{
	return (x & EXP_MANT_MASK) > EXPONENT_MASK;
	}

	static inline uint32_t
	get_mantissa (uint32_t x)
	{
	return x & MANTISSA_MASK;
	}

	static inline float
	asfloat (uint32_t i)
	{
	union
	{
	uint32_t i;
	float f;
	} u = {
	i};
	return u.f;
	}

	/* Convert integer number X, unbiased exponent EP, and sign S to double:
	result = X * 2^(EP+1 - exponent_bias)
	NB: zero is not supported. */
	static inline double
	make_float (uint32_t x, int ep, uint32_t s)
	{
	int lz = __builtin_clz (x) - EXPONENT_WIDTH;
	x <<= lz;
	ep -= lz;
	if (__glibc_unlikely (ep < 0 \|\| x == 0))
	{
	x >>= -ep;
	ep = 0;
	}
	return asfloat (
	s + x + (ep << MANTISSA_WIDTH));
	}

	static inline uint32_t
	asuint (float f)
	{
	union
	{
	float f;
	uint32_t i;
	} u = {f};
	return u.i;
	}
	/* With x = mx * 2^ex and y = my * 2^ey (mx, my, ex, ey being integers), the
	simplest implementation is:
	mx * 2^ex == 2 * mx * 2^(ex - 1)
	or
	while (ex > ey)
	{
	mx *= 2;
	--ex;
	mx %= my;
	}
	With the mathematical equivalence of:
	r == x % y == (x % (N * y)) % y
	And with mx/my being mantissa of a single floating point number (which uses
	less bits than the storage type), on each step the argument reduction can
	be improved by 8 (which is the size of uint32_t minus MANTISSA_WIDTH plus
	the implicit one bit):
	mx * 2^ex == 2^8 * mx * 2^(ex - 8)
	or
	while (ex > ey)
	{
	mx << 8;
	ex -= 8;
	mx %= my;
	}
	Special cases:
	- If x or y is a NaN, a NaN is returned.
	- If x is an infinity, or y is zero, a NaN is returned and EDOM is set.
	- If x is +0/-0, and y is not zero, +0/-0 is returned. */
	float
	stdfmod (float x, float y)
	{
	uint32_t hx = asuint (
	x);
	uint32_t hy = asuint (
	y);
	uint32_t sx = hx & SIGN_MASK;
	/* Get \|x\| and \|y\|. */
	hx ^= sx;
	hy &= ~SIGN_MASK;
	if (__glibc_likely (hx < hy))
	{
	/* If y is a NaN, return a NaN. */
	if (__glibc_unlikely (hy > EXPONENT_MASK))
	return x * y;
	return x;
	}
	int ex = hx >> MANTISSA_WIDTH;
	int ey = hy >> MANTISSA_WIDTH;
	int exp_diff = ex - ey;
	/* Common case where exponents are close: \|x/y\| < 2^9, x not inf/NaN
	and \|x%y\| not denormal. */
	if (__glibc_likely (ey < (EXPONENT_MASK >> MANTISSA_WIDTH) - EXPONENT_WIDTH
	&& ey > MANTISSA_WIDTH
	&& exp_diff <= EXPONENT_WIDTH))
	{
	uint32_t mx = (hx << EXPONENT_WIDTH) \| SIGN_MASK;
	uint32_t my = (hy << EXPONENT_WIDTH) \| SIGN_MASK;
	mx %= (my >> exp_diff);
	if (__glibc_unlikely (mx == 0))
	return asfloat (
	sx);
	int shift = __builtin_clz (mx);
	ex -= shift + 1;
	mx <<= shift;
	mx = sx \| (mx >> EXPONENT_WIDTH);
	return asfloat (
	mx + ((uint32_t)ex << MANTISSA_WIDTH));
	}
	if (__glibc_unlikely (hy == 0 \|\| hx >= EXPONENT_MASK))
	{
	/* If x is a NaN, return a NaN. */
	if (hx > EXPONENT_MASK)
	return x * y;
	/* If x is an infinity or y is zero, return a NaN and set EDOM. */
	return __math_edomf (
	(x * y) / (x * y));
	}
	/* Special case, both x and y are denormal. */
	if (__glibc_unlikely (ex == 0))
	return asfloat (
	sx \| hx % hy);
	/* Extract normalized mantissas - hx is not denormal and hy != 0. */
	uint32_t mx = get_mantissa (
	hx) \| (MANTISSA_MASK + 1);
	uint32_t my = get_mantissa (
	hy) \| (MANTISSA_MASK + 1);
	int lead_zeros_my = EXPONENT_WIDTH;
	ey--;
	/* Special case for denormal y. */
	if (__glibc_unlikely (ey < 0))
	{
	my = hy;
	ey = 0;
	exp_diff--;
	lead_zeros_my = __builtin_clz (my);
	}
	int tail_zeros_my = __builtin_ctz (my);
	int sides_zeroes = lead_zeros_my + tail_zeros_my;
	int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my;
	my >>= right_shift;
	exp_diff -= right_shift;
	ey += right_shift;
	int left_shift = exp_diff < EXPONENT_WIDTH ? exp_diff : EXPONENT_WIDTH;
	mx <<= left_shift;
	exp_diff -= left_shift;
	mx %= my;
	if (__glibc_unlikely (mx == 0))
	return asfloat (
	sx);
	if (exp_diff == 0)
	return make_float (mx, ey,
	sx);
	/* Multiplication with the inverse is faster than repeated modulo. */
	uint32_t inv_hy = UINT32_MAX / my;
	while (exp_diff > sides_zeroes) {
	exp_diff -= sides_zeroes;
	uint32_t hd = (mx * inv_hy) >> (BIT_WIDTH - sides_zeroes);
	mx <<= sides_zeroes;
	mx -= hd * my;
	while (__glibc_unlikely (mx > my))
	mx -= my;
	}
	uint32_t hd = (mx * inv_hy) >> (BIT_WIDTH - exp_diff);
	mx <<= exp_diff;
	mx -= hd * my;
	while (__glibc_unlikely (mx > my))
	mx -= my;
	return make_float (mx, ey,
	sx);
	}