Test Env | Method | Mean | Error | StdDev | Median | Min | Max | Gen 0 | Gen 1 | Gen 2 | Allocated |
---|---|---|---|---|---|---|---|---|---|---|---|
x64 Windows Native | Sin | 22.82 us | 0.015 us | 0.014 us | 22.82 us | 22.80 us | 22.85 us | - | - | - | - |
x86 Windows Native | Sin | 60.48 us | 0.087 us | 0.068 us | 60.48 us | 60.38 us | 60.60 us | - | - | - | - |
x64 Windows Managed | Sin | 21.26 us | 0.305 us | 0.285 us | 21.27 us | 20.85 us | 21.65 us | - | - | - | - |
x86 Windows Managed | Sin | 38.10 us | 0.390 us | 0.346 us | 38.06 us | 37.66 us | 38.71 us | - | - | - | - |
Last active
September 2, 2022 09:14
-
-
Save tannergooding/103be726d48dc3d0b09e890bad0b892f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This is a port of the amd/win-libm implementation provided in assembly here: https://github.com/amd/win-libm/blob/master/sinf.asm | |
// The original source is Copyright (c) 2002-2019 Advanced Micro Devices, Inc. and provided under the MIT License. | |
// | |
// Permission is hereby granted, free of charge, to any person obtaining a copy | |
// of this Software and associated documentaon files (the "Software"), to deal | |
// in the Software without restriction, including without limitation the rights | |
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
// copies of the Software, and to permit persons to whom the Software is | |
// furnished to do so, subject to the following conditions: | |
// | |
// The above copyright notice and this permission notice shall be included in | |
// all copies or substantial portions of the Software. | |
// | |
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
// THE SOFTWARE. | |
const double PiOverTwo = 1.5707963267948966; | |
const double PiOverTwoPartOne = 1.5707963267341256; | |
const double PiOverTwoPartOneTail = 6.077100506506192E-11; | |
const double PiOverTwoPartTwo = 6.077100506303966E-11; | |
const double PiOverTwoPartTwoTail = 2.0222662487959506E-21; | |
const double PiOverFour = 0.7853981633974483; | |
const double TwoOverPi = 0.6366197723675814; | |
const double TwoPowNegSeven = 0.0078125; | |
const double TwoPowNegThirteen = 0.0001220703125; | |
const double C0 = -1.0 / 2.0; // 1 / 2! | |
const double C1 = +1.0 / 24.0; // 1 / 4! | |
const double C2 = -1.0 / 720.0; // 1 / 6! | |
const double C3 = +1.0 / 40320.0; // 1 / 8! | |
const double C4 = -1.0 / 3628800.0; // 1 / 10! | |
const double S1 = -1.0 / 6.0; // 1 / 3! | |
const double S2 = +1.0 / 120.0; // 1 / 5! | |
const double S3 = -1.0 / 5040.0; // 1 / 7! | |
const double S4 = +1.0 / 362880.0; // 1 / 9! | |
private static readonly long[] PiBits = new long[] | |
{ | |
0, | |
5215, | |
13000023176, | |
11362338026, | |
67174558139, | |
34819822259, | |
10612056195, | |
67816420731, | |
57840157550, | |
19558516809, | |
50025467026, | |
25186875954, | |
18152700886 | |
}; | |
public static float Sin(float x) | |
{ | |
double result = x; | |
if (float.IsFinite(x)) | |
{ | |
double ax = Math.Abs(x); | |
if (ax <= PiOverFour) | |
{ | |
if (ax >= TwoPowNegSeven) | |
{ | |
result = SinTaylorSeriesFourIterations(x); | |
} | |
else if (ax >= TwoPowNegThirteen) | |
{ | |
result = SinTaylorSeriesOneIteration(x); | |
} | |
else | |
{ | |
result = x; | |
} | |
} | |
else | |
{ | |
int wasNegative = 0; | |
if (float.IsNegative(x)) | |
{ | |
x = -x; | |
wasNegative = 1; | |
} | |
int region; | |
if (x < 16000000.0) | |
{ | |
// Reduce x to be in the range of -(PI / 4) to (PI / 4), inclusive | |
// This is done by subtracting multiples of (PI / 2). Double-precision | |
// isn't quite accurate enough and introduces some error, but we account | |
// for that using a tail value that helps account for this. | |
long axExp = BitConverter.DoubleToInt64Bits(ax) >> 52; | |
region = (int)(x * TwoOverPi + 0.5); | |
double piOverTwoCount = region; | |
double rHead = x - (piOverTwoCount * PiOverTwoPartOne); | |
double rTail = (piOverTwoCount * PiOverTwoPartOneTail); | |
double r = rHead - rTail; | |
long rExp = (BitConverter.DoubleToInt64Bits(r) << 1) >> 53; | |
if ((axExp - rExp) > 15) | |
{ | |
// The remainder is pretty small compared with x, which implies that x is | |
// near a multiple of (PI / 2). That is, x matches the multiple to at least | |
// 15 bits and so we perform an additional fixup to account for any error | |
r = rHead; | |
rTail = (piOverTwoCount * PiOverTwoPartTwo); | |
rHead = r - rTail; | |
rTail = (piOverTwoCount * PiOverTwoPartTwoTail) - ((r - rHead) - rTail); | |
r = rHead - rTail; | |
} | |
if (rExp >= 0x3F2) // r >= 2^-13 | |
{ | |
if ((region & 1) == 0) // region 0 or 2 | |
{ | |
result = SinTaylorSeriesFourIterations(r); | |
} | |
else // region 1 or 3 | |
{ | |
result = CosTaylorSeriesFourIterations(r); | |
} | |
} | |
else if (rExp > 0x3DE) // r > 1.1641532182693481E-10 | |
{ | |
if ((region & 1) == 0) // region 0 or 2 | |
{ | |
result = SinTaylorSeriesOneIteration(r); | |
} | |
else // region 1 or 3 | |
{ | |
result = CosTaylorSeriesOneIteration(r); | |
} | |
} | |
else | |
{ | |
if ((region & 1) == 0) // region 0 or 2 | |
{ | |
result = r; | |
} | |
else // region 1 or 3 | |
{ | |
result = 1; | |
} | |
} | |
} | |
else | |
{ | |
double r = ReduceForLargeInput(x, out region); | |
if ((region & 1) == 0) // region 0 or 2 | |
{ | |
result = SinTaylorSeriesFourIterations(r); | |
} | |
else // region 1 or 3 | |
{ | |
result = CosTaylorSeriesFourIterations(r); | |
} | |
} | |
region >>= 1; | |
int tmp1 = region & wasNegative; | |
region = ~region; | |
wasNegative = ~wasNegative; | |
int tmp2 = region & wasNegative; | |
if (((tmp1 | tmp2) & 1) == 0) | |
{ | |
// If the original region was 0/1 and arg is negative, then we negate the result. | |
// -or- | |
// If the original region was 2/3 and arg is positive, then we negate the result. | |
result = -result; | |
} | |
} | |
} | |
return (float)result; | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
static double CosTaylorSeriesOneIteration(double x1) | |
{ | |
// 1 - (x^2 / 2!) | |
double x2 = x1 * x1; | |
return 1.0 + (x2 * C1); | |
} | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
static double CosTaylorSeriesFourIterations(double x1) | |
{ | |
// 1 - (x^2 / 2!) + (x^4 / 4!) - (x^6 / 6!) + (x^8 / 8!) - (x^10 / 10!) | |
double x2 = x1 * x1; | |
double x4 = x2 * x2; | |
return 1.0 + (x2 * C0) + (x4 * ((C1 + (x2 * C2)) + (x4 * (C3 + (x2 * C4))))); | |
} | |
static unsafe double ReduceForLargeInput(double x, out int region) | |
{ | |
Debug.Assert(!double.IsNegative(x)); | |
// This method simulates multi-precision floating-point | |
// arithmetic and is accurate for all 1 <= x < infinity | |
const int BitsPerIteration = 36; | |
long ux = BitConverter.DoubleToInt64Bits(x); | |
int xExp = (int)(((ux & 0x7FF0000000000000) >> 52) - 1023); | |
ux = ((ux & 0x000FFFFFFFFFFFFF) | 0x0010000000000000) >> 29; | |
// Now ux is the mantissa bit pattern of x as a long integer | |
long mask = 1; | |
mask = (mask << BitsPerIteration) - 1; | |
// Set first and last to the positions of the first and last chunks of (2 / PI) that we need | |
int first = xExp / BitsPerIteration; | |
int resultExp = xExp - (first * BitsPerIteration); | |
// 120 is the theoretical maximum number of bits (actually | |
// 115 for IEEE single precision) that we need to extract | |
// from the middle of (2 / PI) to compute the reduced argument | |
// accurately enough for our purposes | |
int last = first + (120 / BitsPerIteration); | |
// Unroll the loop. This is only correct because we know that bitsper is fixed as 36. | |
long* result = stackalloc long[10]; | |
long u, carry; | |
result[4] = 0; | |
u = PiBits[last] * ux; | |
result[3] = u & mask; | |
carry = u >> BitsPerIteration; | |
u = PiBits[last - 1] * ux + carry; | |
result[2] = u & mask; | |
carry = u >> BitsPerIteration; | |
u = PiBits[last - 2] * ux + carry; | |
result[1] = u & mask; | |
carry = u >> BitsPerIteration; | |
u = PiBits[first] * ux + carry; | |
result[0] = u & mask; | |
// Reconstruct the result | |
int ltb = (int)((((result[0] << BitsPerIteration) | result[1]) >> (BitsPerIteration - 1 - resultExp)) & 7); | |
long mantissa; | |
long nextBits; | |
// determ says whether the fractional part is >= 0.5 | |
bool determ = (ltb & 1) != 0; | |
int i = 1; | |
if (determ) | |
{ | |
// The mantissa is >= 0.5. We want to subtract it from 1.0 by negating all the bits | |
region = ((ltb >> 1) + 1) & 3; | |
mantissa = 1; | |
mantissa = ~(result[1]) & ((mantissa << (BitsPerIteration - resultExp)) - 1); | |
while (mantissa < 0x0000000000010000) | |
{ | |
i++; | |
mantissa = (mantissa << BitsPerIteration) | (~(result[i]) & mask); | |
} | |
nextBits = (~(result[i + 1]) & mask); | |
} | |
else | |
{ | |
region = (ltb >> 1); | |
mantissa = 1; | |
mantissa = result[1] & ((mantissa << (BitsPerIteration - resultExp)) - 1); | |
while (mantissa < 0x0000000000010000) | |
{ | |
i++; | |
mantissa = (mantissa << BitsPerIteration) | result[i]; | |
} | |
nextBits = result[i + 1]; | |
} | |
// Normalize the mantissa. | |
// The shift value 6 here, determined by trial and error, seems to give optimal speed. | |
int bc = 0; | |
while (mantissa < 0x0000400000000000) | |
{ | |
bc += 6; | |
mantissa <<= 6; | |
} | |
while (mantissa < 0x0010000000000000) | |
{ | |
bc++; | |
mantissa <<= 1; | |
} | |
mantissa |= nextBits >> (BitsPerIteration - bc); | |
int rExp = 52 + resultExp - bc - i * BitsPerIteration; | |
// Put the result exponent rexp onto the mantissa pattern | |
u = (rExp + 1023L) << 52; | |
ux = (mantissa & 0x000FFFFFFFFFFFFF) | u; | |
if (determ) | |
{ | |
// If we negated the mantissa we negate x too | |
ux |= unchecked((long)(0x8000000000000000)); | |
} | |
x = BitConverter.Int64BitsToDouble(ux); | |
// x is a double precision version of the fractional part of | |
// (x * (2 / PI)). Multiply x by (PI / 2) in double precision | |
// to get the reduced result. | |
return x * PiOverTwo; | |
} | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
static double SinTaylorSeriesOneIteration(double x1) | |
{ | |
// x - (x^3 / 3!) | |
double x2 = x1 * x1; | |
double x3 = x2 * x1; | |
return x1 + (x3 * S1); | |
} | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
static double SinTaylorSeriesFourIterations(double x1) | |
{ | |
// x - (x^3 / 3!) + (x^5 / 5!) - (x^7 / 7!) + (x^9 / 9!) | |
double x2 = x1 * x1; | |
double x3 = x2 * x1; | |
double x4 = x2 * x2; | |
return x1 + ((S1 + (x2 * S2) + (x4 * (S3 + (x2 * S4)))) * x3); | |
} | |
} |
@adamsitnik, these numbers are from the perf repo benchmark. The only code above is the managed implementation
@tannergooding thanks for the explanation! BTW these numbers are very impressive!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@tannergooding should we add this benchmarks to the performance repo?