-
-
Save bitonic/d0f5a0a44e37d4f0be03d34d47acb6cf to your computer and use it in GitHub Desktop.
// Copyright (c) 2021 Francesco Mazzoli <[email protected]> | |
// | |
// Permission to use, copy, modify, and distribute this software for any | |
// purpose with or without fee is hereby granted, provided that the above | |
// copyright notice and this permission notice appear in all copies. | |
// | |
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | |
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | |
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
// | |
// Branchless, vectorized `atan2f`. Various functions of increasing | |
// performance are presented. The fastest version is 50~ faster than libc | |
// on batch workloads, outputing a result every ~2 clock cycles, compared to | |
// ~110 for libc. The functions all use the same `atan` approximation, and their | |
// max error is around ~1/10000 of a degree. | |
// | |
// They also do not handle inf / -inf | |
// and the origin as an input as they should -- in our case these are a sign | |
// that something is wrong anyway. Moreover, manual_2 does not handle NaN | |
// correctly (it drops them silently), and all the auto_ functions do not | |
// handle -0 correctly. But manual_1 handles everything but +-inf and +-0,+-0 | |
// correctly. | |
// | |
// Tested on a Xeon W-2145, a Skylake processor. Compiled with | |
// | |
// $ clang++ --version | |
// clang version 12.0.1 | |
// $ clang++ -static --std=c++20 -march=skylake -O3 -Wall vectorized-atan2f.cpp -lm -o vectorized-atan2f | |
// | |
// Results: | |
// | |
// $ ./vectorized-atan2f --test-edge-cases 100000 100 1000 | |
// Generating data... done. | |
// Tests will read 824kB and write 412kB (103048 points) | |
// Running 1000 warm ups and 1000 iterations | |
// | |
// baseline: 2.46 s, 0.32GB/s, 105.79 cycles/elem, 129.34 instrs/elem, 1.22 instrs/cycle, 27.76 branches/elem, 7.08% branch misses, 0.47% cache misses, 4.29GHz | |
// auto_1: 0.21 s, 3.75GB/s, 8.29 cycles/elem, 8.38 instrs/elem, 1.01 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.03% cache misses, 3.89GHz, 0.000109283deg max error, max error point: -0.563291132, -0.544303775 | |
// auto_2: 97.50ms, 8.19GB/s, 3.79 cycles/elem, 5.38 instrs/elem, 1.42 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.01% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967 | |
// auto_3: 75.95ms, 10.52GB/s, 2.95 cycles/elem, 4.13 instrs/elem, 1.40 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.03% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967 | |
// auto_4: 55.89ms, 14.29GB/s, 2.17 cycles/elem, 3.63 instrs/elem, 1.67 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.01% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967 | |
// manual_1: 52.57ms, 15.20GB/s, 2.04 cycles/elem, 3.63 instrs/elem, 1.78 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.03% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967 | |
// manual_2: 50.16ms, 15.93GB/s, 1.95 cycles/elem, 3.88 instrs/elem, 1.99 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.02% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967 | |
// | |
// The atan approximation is from sheet 11 of "Approximations for digital | |
// computers", C. Hastings, 1955, a delightful document overall. | |
// | |
// Functions auto_1 to auto_5 are automatically vectorized. manual_1 and manual_2 | |
// are vectorized manually. | |
// | |
// You'll get quite different results with g++. The main problem with g++ is that | |
// it vectorizes less aggressively. However it inserts FMA instructions _more_ | |
// aggresively, which is a plus. clang needs the `fp-contract` pragma | |
// or an explicit `fmaf`. | |
// | |
// Moreover, the | |
#include <cmath> | |
#include <algorithm> | |
#include <iostream> | |
#include <limits> | |
#include <immintrin.h> | |
#include <vector> | |
#include <unistd.h> | |
#include <sys/ioctl.h> | |
#include <linux/perf_event.h> | |
#include <string.h> | |
#include <asm/unistd.h> | |
#include <random> | |
#include <sstream> | |
#define USE_AVX | |
using namespace std; | |
#define NOINLINE __attribute__((noinline)) | |
#define UNUSED __attribute__((unused)) | |
// -------------------------------------------------------------------- | |
// AVX utils | |
#ifdef USE_AVX | |
template<typename A> | |
inline void assert_avx_aligned(const A* ptr) { | |
if (reinterpret_cast<uintptr_t>(ptr) % 32 != 0) { | |
cerr << "Pointer " << ptr << " is not 32-byte aligned, exiting" << endl; | |
exit(EXIT_FAILURE); | |
} | |
} | |
inline void assert_multiple_of_8(size_t num) { | |
if (num % 8 != 0) { | |
cerr << "Array size " << num << " is not a multiple of 8, exiting" << endl; | |
exit(EXIT_FAILURE); | |
} | |
} | |
#endif | |
// -------------------------------------------------------------------- | |
// functions | |
NOINLINE | |
static void atan2_baseline(size_t cases, const float* ys, const float* xs, float* out) { | |
for (size_t i = 0; i < cases; i++) { | |
out[i] = atan2f(ys[i], xs[i]); | |
} | |
} | |
// not tested since it is very slow. | |
NOINLINE UNUSED | |
static void atan2_fpatan(size_t cases, const float* ys, const float* xs, float* out) { | |
for (size_t i = 0; i < cases; i++) { | |
asm ( | |
"flds (%[ys], %[i], 4)\n" | |
"flds (%[xs], %[i], 4)\n" | |
"fpatan\n" | |
"fstps (%[out], %[i], 4)\n" | |
: | |
: [ys]"r"(ys), [xs]"r"(xs), [out]"r"(out), [i]"r"(i) | |
); | |
} | |
} | |
// Polynomial approximation of atan between [-1, 1]. Stated max error ~0.000001rad. | |
// See comment at the beginning of file for source. | |
inline float atan_approximation(float x) { | |
float a1 = 0.99997726f; | |
float a3 = -0.33262347f; | |
float a5 = 0.19354346f; | |
float a7 = -0.11643287f; | |
float a9 = 0.05265332f; | |
float a11 = -0.01172120f; | |
float x_sq = x*x; | |
return | |
x * (a1 + x_sq * (a3 + x_sq * (a5 + x_sq * (a7 + x_sq * (a9 + x_sq * a11))))); | |
} | |
// First automatic version: naive translation of the maths | |
NOINLINE | |
static void atan2_auto_1(size_t num_points, const float* ys, const float* xs, float* out) { | |
for (size_t i = 0; i < num_points; i++) { | |
// Ensure input is in [-1, +1] | |
float y = ys[i]; | |
float x = xs[i]; | |
bool swap = fabs(x) < fabs(y); | |
float atan_input = (swap ? x : y) / (swap ? y : x); | |
// Approximate atan | |
float res = atan_approximation(atan_input); | |
// If swapped, adjust atan output | |
res = swap ? (atan_input >= 0.0f ? M_PI_2 : -M_PI_2) - res : res; | |
// Adjust quadrants | |
if (x >= 0.0f && y >= 0.0f) {} // 1st quadrant | |
else if (x < 0.0f && y >= 0.0f) { res = M_PI + res; } // 2nd quadrant | |
else if (x < 0.0f && y < 0.0f) { res = -M_PI + res; } // 3rd quadrant | |
else if (x >= 0.0f && y < 0.0f) {} // 4th quadrant | |
// Store result | |
out[i] = res; | |
} | |
} | |
// Second automatic version: get rid of casting to double | |
NOINLINE | |
static void atan2_auto_2(size_t num_points, const float* ys, const float* xs, float* out) { | |
float pi = M_PI; | |
float pi_2 = M_PI_2; | |
for (size_t i = 0; i < num_points; i++) { | |
// Ensure input is in [-1, +1] | |
float y = ys[i]; | |
float x = xs[i]; | |
bool swap = fabs(x) < fabs(y); | |
float atan_input = (swap ? x : y) / (swap ? y : x); | |
// Approximate atan | |
float res = atan_approximation(atan_input); | |
// If swapped, adjust atan output | |
res = swap ? (atan_input >= 0.0f ? pi_2 : -pi_2) - res : res; | |
// Adjust quadrants | |
if (x >= 0.0f && y >= 0.0f) {} // 1st quadrant | |
else if (x < 0.0f && y >= 0.0f) { res = pi + res; } // 2nd quadrant | |
else if (x < 0.0f && y < 0.0f) { res = -pi + res; } // 3rd quadrant | |
else if (x >= 0.0f && y < 0.0f) {} // 4th quadrant | |
// Store result | |
out[i] = res; | |
} | |
} | |
// Third automatic version: perform positive check for x and y once -- the compiler | |
// can't assume that in the presence of NaNs since `0/0 >= 0` is false and `0/0 < 0` is also | |
// false. | |
NOINLINE | |
static void atan2_auto_3(size_t num_points, const float* ys, const float* xs, float* out) { | |
float pi = M_PI; | |
float pi_2 = M_PI_2; | |
for (size_t i = 0; i < num_points; i++) { | |
// Ensure input is in [-1, +1] | |
float y = ys[i]; | |
float x = xs[i]; | |
bool swap = fabs(x) < fabs(y); | |
float atan_input = (swap ? x : y) / (swap ? y : x); | |
// Approximate atan | |
float res = atan_approximation(atan_input); | |
// If swapped, adjust atan output | |
res = swap ? (atan_input >= 0.0f ? pi_2 : -pi_2) - res : res; | |
// Adjust the result depending on the input quadrant | |
if (x < 0.0f) { | |
res = (y >= 0.0f ? pi : -pi) + res; | |
} | |
// Store result | |
out[i] = res; | |
} | |
} | |
inline float atan_fma_approximation(float x) { | |
float a1 = 0.99997726f; | |
float a3 = -0.33262347f; | |
float a5 = 0.19354346f; | |
float a7 = -0.11643287f; | |
float a9 = 0.05265332f; | |
float a11 = -0.01172120f; | |
// Compute approximation using Horner's method | |
float x_sq = x*x; | |
return | |
x * fmaf(x_sq, fmaf(x_sq, fmaf(x_sq, fmaf(x_sq, fmaf(x_sq, a11, a9), a7), a5), a3), a1); | |
} | |
// Fifth automatic version: use FMA for the polynomial | |
NOINLINE | |
static void atan2_auto_4(size_t num_points, const float* ys, const float* xs, float* out) { | |
float pi = M_PI; | |
float pi_2 = M_PI_2; | |
for (size_t i = 0; i < num_points; i++) { | |
// Ensure input is in [-1, +1] | |
float y = ys[i]; | |
float x = xs[i]; | |
bool swap = fabs(x) < fabs(y); | |
float atan_input = (swap ? x : y) / (swap ? y : x); | |
// Approximate atan | |
float res = atan_fma_approximation(atan_input); | |
// If swapped, adjust atan output | |
res = swap ? copysignf(pi_2, atan_input) - res : res; | |
// Adjust the result depending on the input quadrant | |
if (x < 0.0f) { | |
res = copysignf(pi, y) + res; | |
} | |
// Store result | |
out[i] = res; | |
} | |
} | |
#ifdef USE_AVX | |
inline __m256 atan_avx_approximation(__m256 x) { | |
__m256 a1 = _mm256_set1_ps( 0.99997726f); | |
__m256 a3 = _mm256_set1_ps(-0.33262347f); | |
__m256 a5 = _mm256_set1_ps( 0.19354346f); | |
__m256 a7 = _mm256_set1_ps(-0.11643287f); | |
__m256 a9 = _mm256_set1_ps( 0.05265332f); | |
__m256 a11 = _mm256_set1_ps(-0.01172120f); | |
__m256 x_sq = _mm256_mul_ps(x, x); | |
__m256 result; | |
result = a11; | |
result = _mm256_fmadd_ps(x_sq, result, a9); | |
result = _mm256_fmadd_ps(x_sq, result, a7); | |
result = _mm256_fmadd_ps(x_sq, result, a5); | |
result = _mm256_fmadd_ps(x_sq, result, a3); | |
result = _mm256_fmadd_ps(x_sq, result, a1); | |
result = _mm256_mul_ps(x, result); | |
return result; | |
} | |
// First manual version: straightfoward translation of atan2_auto_5 | |
NOINLINE | |
static void atan2_manual_1(size_t num_points, const float* ys, const float* xs, float* out) { | |
// Check that the input plays well with AVX | |
assert_avx_aligned(ys), assert_avx_aligned(xs), assert_avx_aligned(out); | |
// Store pi and pi/2 as constants | |
const __m256 pi = _mm256_set1_ps(M_PI); | |
const __m256 pi_2 = _mm256_set1_ps(M_PI_2); | |
// Create bit masks that we will need. | |
// The first one is all 1s except from the sign bit: | |
// | |
// 01111111111111111111111111111111 | |
// | |
// We can use it to make a float absolute by AND'ing with it. | |
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));; | |
// The second is only the sign bit: | |
// | |
// 10000000000000000000000000000000 | |
// | |
// we can use it to extract the sign of a number by AND'ing with it. | |
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); | |
for (size_t i = 0; i < num_points; i += 8) { | |
// Load 8 elements from `ys` and `xs` into two vectors. | |
__m256 y = _mm256_load_ps(&ys[i]); | |
__m256 x = _mm256_load_ps(&xs[i]); | |
// Compare |y| > |x|. This will create an 8-vector of floats that we can | |
// use as a mask: the elements where the respective comparison is true will be filled | |
// with 1s, with 0s where the comparison is false. | |
// | |
// For example, if we were comparing the vectors | |
// | |
// 5 -5 5 -5 5 -5 5 -5 | |
// > | |
// -5 5 -5 5 -5 5 -5 5 | |
// = | |
// 1s 0s 1s 0s 1s 0s 1s 0s | |
// | |
// Where `1s = 0xFFFFFFFF` and `0s = 0x00000000`. | |
__m256 swap_mask = _mm256_cmp_ps( | |
_mm256_and_ps(y, abs_mask), // |y| | |
_mm256_and_ps(x, abs_mask), // |x| | |
_CMP_GT_OS | |
); | |
// Create the atan input by "blending" `y` and `x`, according to the mask computed | |
// above. The blend instruction will pick the first or second argument based on | |
// the mask we passed in. In our case we need the number of larger magnitude to | |
// be the denominator. | |
__m256 atan_input = _mm256_div_ps( | |
_mm256_blendv_ps(y, x, swap_mask), // pick the lowest between |y| and |x| for each number | |
_mm256_blendv_ps(x, y, swap_mask) // and the highest. | |
); | |
// Approximate atan | |
__m256 result = atan_avx_approximation(atan_input); | |
// If swapped, adjust atan output. We use blending again to leave | |
// the output unchanged if we didn't swap anything. | |
// | |
// If we need to adjust it, we simply carry the sign over from the input | |
// to `pi_2` by using the `sign_mask`. This avoids a more expensive comparison, | |
// and also handles edge cases such as -0 better. | |
result = _mm256_blendv_ps( | |
result, | |
_mm256_sub_ps( | |
_mm256_or_ps(pi_2, _mm256_and_ps(atan_input, sign_mask)), | |
result | |
), | |
swap_mask | |
); | |
// Adjust the result depending on the input quadrant. | |
// | |
// We create a mask for the sign of `x` using an arithmetic right shift: | |
// the mask will be all 0s if the sign if positive, and all 1s | |
// if the sign is negative. | |
__m256 x_sign_mask = _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31)); | |
// Then use the mask to perform the adjustment only when the sign | |
// if positive, and use the sign bit of `y` to know whether to add | |
// `pi` or `-pi`. | |
result = _mm256_add_ps( | |
_mm256_and_ps( | |
_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), | |
x_sign_mask | |
), | |
result | |
); | |
// Store result | |
_mm256_store_ps(&out[i], result); | |
} | |
} | |
// Second manual version: use the abs values we get at the beginning | |
// for more ILP (see comment below) | |
NOINLINE | |
static void atan2_manual_2(size_t num_points, const float* ys, const float* xs, float* out) { | |
assert_avx_aligned(ys), assert_avx_aligned(xs), assert_avx_aligned(out); | |
const __m256 pi = _mm256_set1_ps(M_PI); | |
const __m256 pi_2 = _mm256_set1_ps(M_PI_2); | |
// no sign bit -- AND with this to get absolute value | |
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));; | |
// only sign bit -- XOR with this to get negative | |
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); | |
for (size_t i = 0; i < num_points; i += 8) { | |
__m256 y = _mm256_load_ps(&ys[i]); | |
__m256 x = _mm256_load_ps(&xs[i]); | |
__m256 abs_y = _mm256_and_ps(abs_mask, y); | |
__m256 abs_x = _mm256_and_ps(abs_mask, x); | |
// atan_input = min(|y|, |x|) / max(|y|, |x|) | |
// | |
// I've experimented with using blend rather than min, given that we | |
// compute `swap_mask` later anyway, but that delays the atan | |
// computation by 2+ cycles, and is less parallel, since on skylake: | |
// | |
// latency throughput | |
// vminps 4 0.5 | |
// vmaxps 4 0.5 | |
// vblendvps 2 0.66 | |
// vcmpps 4 0.5 | |
// | |
// So while it decreases the numbers of instructions needed it makes | |
// the overall function slower. | |
// | |
// This has the unfortunate effect of silently dropping NaNs in the | |
// inputs, since in case of NaNs these instructions just return | |
// the second argument. | |
__m256 atan_input = _mm256_div_ps( | |
_mm256_min_ps(abs_x, abs_y), _mm256_max_ps(abs_x, abs_y) | |
); | |
// Approximate atan | |
__m256 result = atan_avx_approximation(atan_input); | |
// We first do the usual +- pi - res, but in this case | |
// we know the sign of pi since the input is always positive. | |
// | |
// result = (abs_y > abs_x) ? pi - res : res; | |
__m256 swap_mask = _mm256_cmp_ps(abs_y, abs_x, _CMP_GT_OQ); | |
result = _mm256_add_ps( | |
_mm256_xor_ps(result, _mm256_and_ps(sign_mask, swap_mask)), | |
_mm256_and_ps(pi_2, swap_mask) | |
); | |
// We now have to adjust the quadrant slightly differently: | |
// | |
// * If both values are positive, do nothing; | |
// * If y is negative and x is positive, do nothing; | |
// * If y is positive and x is negative, do pi + result; | |
// * If y is negative and x is negative, do result - pi; | |
// | |
// These can easily be verified by analyzing what happens to the output | |
// for each input quadrant. | |
// | |
// These cases can be compressed to the two branches below, and then | |
// made branchless without using any blend instruction. | |
// result = (x < 0) ? pi - res : res; | |
__m256 x_sign_mask = _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31)); | |
result = _mm256_add_ps( | |
_mm256_xor_ps(result, _mm256_and_ps(x, sign_mask)), | |
_mm256_and_ps(pi, x_sign_mask) | |
); | |
// result = (y < 0) ? -res : res; | |
result = _mm256_xor_ps(result, _mm256_and_ps(y, sign_mask)); | |
// Store result | |
_mm256_store_ps(&out[i], result); | |
} | |
} | |
#endif | |
// -------------------------------------------------------------------- | |
// data generation | |
NOINLINE | |
static void generate_data( | |
bool random_points, | |
bool test_edge_cases, | |
size_t desired_num_cases, | |
size_t* cases, | |
float** ys, | |
float** xs, | |
float** ref_out, | |
float** out | |
) { | |
cout << "Generating data..." << flush; | |
size_t edge = static_cast<size_t>(sqrtf(static_cast<float>(desired_num_cases))); | |
vector<float> extra_cases; | |
if (test_edge_cases) { | |
// We want to make sure to test some special cases, so we always add them. | |
// We do not include negative zero, since we do not care about the sign | |
// matching up in that case. | |
// We also do not include NaN and inf, since we assume that the input does | |
// not contain it. | |
using limits = numeric_limits<float>; | |
extra_cases = { | |
1.0f, -1.0f, 0.0f, -0.0f, | |
limits::epsilon(), -limits::epsilon(), | |
limits::max(), -limits::max(), | |
limits::quiet_NaN() | |
}; | |
} | |
// The -1 is for the 0, 0 case, which we do not want. | |
*cases = (edge+extra_cases.size())*(edge+extra_cases.size()) - 1; | |
// Make sure cases is a multiple of 8 so that the AVX functions can work cleanly | |
*cases = *cases + (8 - (*cases % 8)); | |
*ys = reinterpret_cast<float*>(aligned_alloc(32, *cases * sizeof(float))); | |
*xs = reinterpret_cast<float*>(aligned_alloc(32, *cases * sizeof(float))); | |
*ref_out = reinterpret_cast<float*>(aligned_alloc(32, *cases * sizeof(float))); | |
*out = reinterpret_cast<float*>(aligned_alloc(32, *cases * sizeof(float))); | |
if (*ys == nullptr || *xs == nullptr || *ref_out == nullptr || *out == nullptr) { | |
cerr << "Could not allocate arrays" << endl; | |
exit(EXIT_FAILURE); | |
} | |
mt19937_64 gen(0); | |
{ | |
float bound = 1.0f; | |
float step = (bound * 2.0f) / static_cast<float>(edge); | |
uniform_real_distribution<float> dist{ -1.0f, 1.0f }; | |
const auto get_number = [random_points, &extra_cases, &gen, bound, step, &dist](size_t i) -> float { | |
if (i < extra_cases.size()) { | |
return extra_cases.at(i); | |
} else if (random_points) { | |
return dist(gen); | |
} else { | |
return -bound + static_cast<float>(i - extra_cases.size()) * step; | |
} | |
}; | |
size_t ix = 0; | |
for (size_t i = 0; i < edge + extra_cases.size(); i++) { | |
for (size_t j = 0; j < edge + extra_cases.size(); j++) { | |
float y = get_number(i); | |
float x = get_number(j); | |
if (y == 0.0f && x == 0.0f) { continue; } | |
(*ys)[ix] = y; | |
(*xs)[ix] = x; | |
ix++; | |
} | |
} | |
// pad with dummies | |
for (; ix < *cases; ix++) { | |
(*ys)[ix] = 1.0f; | |
(*xs)[ix] = 1.0f; | |
} | |
} | |
// shuffle to confuse branch predictor in the case of explicit branches and | |
// non-random points | |
{ | |
for (size_t i = 0; i < *cases - 1; i++) { | |
size_t swap_with = static_cast<size_t>(i + 1 + (gen() % (*cases - i - 1))); | |
swap((*ys)[i], (*ys)[swap_with]); | |
swap((*xs)[i], (*xs)[swap_with]); | |
} | |
} | |
cout << " done." << endl; | |
} | |
// -------------------------------------------------------------------- | |
// Pin to first CPU | |
static void pin_to_cpu_0(void) { | |
cpu_set_t cpu_mask; | |
CPU_ZERO(&cpu_mask); | |
CPU_SET(0, &cpu_mask); | |
if (sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask) != 0) { | |
fprintf(stderr, "Could not set CPU affinity\n"); | |
exit(EXIT_FAILURE); | |
} | |
} | |
// -------------------------------------------------------------------- | |
// perf instrumentation -- a mixture of man 3 perf_event_open and | |
// <https://stackoverflow.com/a/42092180> | |
static long | |
perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { | |
int ret; | |
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); | |
return ret; | |
} | |
static void setup_perf_event( | |
struct perf_event_attr *evt, | |
int *fd, | |
uint64_t *id, | |
uint32_t evt_type, | |
uint64_t evt_config, | |
int group_fd | |
) { | |
memset(evt, 0, sizeof(struct perf_event_attr)); | |
evt->type = evt_type; | |
evt->size = sizeof(struct perf_event_attr); | |
evt->config = evt_config; | |
evt->disabled = 1; | |
evt->exclude_kernel = 1; | |
evt->exclude_hv = 1; | |
evt->read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; | |
*fd = perf_event_open(evt, 0, -1, group_fd, 0); | |
if (*fd == -1) { | |
fprintf(stderr, "Error opening leader %llx\n", evt->config); | |
exit(EXIT_FAILURE); | |
} | |
ioctl(*fd, PERF_EVENT_IOC_ID, id); | |
} | |
static struct perf_event_attr perf_cycles_evt; | |
static int perf_cycles_fd; | |
static uint64_t perf_cycles_id; | |
static struct perf_event_attr perf_clock_evt; | |
static int perf_clock_fd; | |
static uint64_t perf_clock_id; | |
static struct perf_event_attr perf_instrs_evt; | |
static int perf_instrs_fd; | |
static uint64_t perf_instrs_id; | |
static struct perf_event_attr perf_cache_misses_evt; | |
static int perf_cache_misses_fd; | |
static uint64_t perf_cache_misses_id; | |
static struct perf_event_attr perf_cache_references_evt; | |
static int perf_cache_references_fd; | |
static uint64_t perf_cache_references_id; | |
static struct perf_event_attr perf_branch_misses_evt; | |
static int perf_branch_misses_fd; | |
static uint64_t perf_branch_misses_id; | |
static struct perf_event_attr perf_branch_instructions_evt; | |
static int perf_branch_instructions_fd; | |
static uint64_t perf_branch_instructions_id; | |
static void perf_init(void) { | |
// Cycles | |
setup_perf_event( | |
&perf_cycles_evt, &perf_cycles_fd, &perf_cycles_id, | |
PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, -1 | |
); | |
// Clock | |
setup_perf_event( | |
&perf_clock_evt, &perf_clock_fd, &perf_clock_id, | |
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK, perf_cycles_fd | |
); | |
// Instructions | |
setup_perf_event( | |
&perf_instrs_evt, &perf_instrs_fd, &perf_instrs_id, | |
PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, perf_cycles_fd | |
); | |
// Cache misses | |
setup_perf_event( | |
&perf_cache_misses_evt, &perf_cache_misses_fd, &perf_cache_misses_id, | |
PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES, perf_cycles_fd | |
); | |
// Cache references | |
setup_perf_event( | |
&perf_cache_references_evt, &perf_cache_references_fd, &perf_cache_references_id, | |
PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES, perf_cycles_fd | |
); | |
// Branch misses | |
setup_perf_event( | |
&perf_branch_misses_evt, &perf_branch_misses_fd, &perf_branch_misses_id, | |
PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES, perf_cycles_fd | |
); | |
// Branch instructions | |
setup_perf_event( | |
&perf_branch_instructions_evt, &perf_branch_instructions_fd, &perf_branch_instructions_id, | |
PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, perf_cycles_fd | |
); | |
} | |
static void perf_close(void) { | |
close(perf_clock_fd); | |
close(perf_cycles_fd); | |
close(perf_instrs_fd); | |
close(perf_cache_misses_fd); | |
close(perf_cache_references_fd); | |
} | |
static void disable_perf_count(void) { | |
ioctl(perf_cycles_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP); | |
} | |
static void enable_perf_count(void) { | |
ioctl(perf_cycles_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); | |
} | |
static void reset_perf_count(void) { | |
ioctl(perf_cycles_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); | |
} | |
struct perf_read_value { | |
uint64_t value; | |
uint64_t id; | |
}; | |
struct perf_read_format { | |
uint64_t nr; | |
struct perf_read_value values[]; | |
}; | |
static char perf_read_buf[4096]; | |
struct perf_count { | |
uint64_t cycles; | |
double seconds; | |
uint64_t instructions; | |
uint64_t cache_misses; | |
uint64_t cache_references; | |
uint64_t branch_misses; | |
uint64_t branch_instructions; | |
perf_count(): cycles(0), seconds(0.0), instructions(0), cache_misses(0), cache_references(0), branch_misses(0), branch_instructions(0) {} | |
}; | |
static void read_perf_count(struct perf_count *count) { | |
if (!read(perf_cycles_fd, perf_read_buf, sizeof(perf_read_buf))) { | |
fprintf(stderr, "Could not read cycles from perf\n"); | |
exit(EXIT_FAILURE); | |
} | |
struct perf_read_format* rf = (struct perf_read_format *) perf_read_buf; | |
if (rf->nr != 7) { | |
fprintf(stderr, "Bad number of perf events\n"); | |
exit(EXIT_FAILURE); | |
} | |
for (int i = 0; i < static_cast<int>(rf->nr); i++) { | |
struct perf_read_value *value = &rf->values[i]; | |
if (value->id == perf_cycles_id) { | |
count->cycles = value->value; | |
} else if (value->id == perf_clock_id) { | |
count->seconds = ((double) (value->value / 1000ull)) / 1000000.0; | |
} else if (value->id == perf_instrs_id) { | |
count->instructions = value->value; | |
} else if (value->id == perf_cache_misses_id) { | |
count->cache_misses = value->value; | |
} else if (value->id == perf_cache_references_id) { | |
count->cache_references = value->value; | |
} else if (value->id == perf_branch_misses_id) { | |
count->branch_misses = value->value; | |
} else if (value->id == perf_branch_instructions_id) { | |
count->branch_instructions = value->value; | |
} else { | |
fprintf(stderr, "Spurious value in perf read (%ld)\n", value->id); | |
exit(EXIT_FAILURE); | |
} | |
} | |
} | |
// -------------------------------------------------------------------- | |
// running the function | |
struct max_error { | |
float y; | |
float x; | |
float error; | |
max_error(): y(0.0f), x(0.0f), error(0.0f) {} | |
void update(float y, float x, float reference, float result) { | |
if (isnan(reference) && !isnan(result)) { | |
cerr << "Expected NaN in result, but got " << result << endl; | |
cerr << "For point " << x << ", " << y << endl; | |
exit(EXIT_FAILURE); | |
} | |
if (!isnan(reference) && isnan(result)) { | |
cerr << "Unexpected NaN in result, expected " << reference << endl; | |
cerr << "For point " << x << ", " << y << endl; | |
exit(EXIT_FAILURE); | |
} | |
float error = abs(reference - result); | |
if (error > this->error) { | |
this->y = y; | |
this->x = x; | |
this->error = error; | |
} | |
} | |
}; | |
NOINLINE | |
static void run_timed( | |
int pre_iterations, | |
int iterations, | |
bool test_negative_zero, | |
bool test_max, | |
bool test_nan, | |
size_t num, | |
const string& name, | |
void (*fun)(size_t, const float*, const float*, float*), | |
const float* ref_out, // if null no comparison will be run | |
const float* in_y, | |
const float* in_x, | |
float* out | |
) { | |
if (iterations < 1) { | |
fprintf(stderr, "iterations < 1: %d\n", iterations); | |
exit(EXIT_FAILURE); | |
} | |
for (int i = 0; i < pre_iterations; i++) { | |
fun(num, in_y, in_x, out); | |
} | |
struct perf_count counts; | |
disable_perf_count(); | |
reset_perf_count(); | |
enable_perf_count(); | |
// The enabling / disabling adds noise for small timings, so | |
// wrap the loop since the loop counts for very little (one add, one cmp, one | |
// predicted conditional jump). | |
for (int i = 0; i < iterations; i++) { | |
fun(num, in_y, in_x, out); | |
} | |
disable_perf_count(); | |
read_perf_count(&counts); | |
// two floats per element | |
uint64_t bytes = ((uint64_t) num) * sizeof(float) * 2; | |
double gb_per_s = (((double) bytes) / 1000000000.0) / (counts.seconds / ((double) iterations)); | |
double time = counts.seconds; | |
const char *unit = " s"; | |
if (time < 0.1) { | |
time *= 1000.0; | |
unit = "ms"; | |
} | |
if (time < 0.1) { | |
time *= 1000.0; | |
unit = "us"; | |
} | |
constexpr int padded_name_size = 10; | |
char padded_name[padded_name_size]; | |
int name_len = snprintf(padded_name, padded_name_size, "%s:", name.c_str()); | |
for (int i = name_len; i < padded_name_size; i++) { | |
padded_name[i] = ' '; | |
} | |
padded_name[padded_name_size-1] = '\0'; | |
printf( | |
"%s %5.2f%s, %6.2fGB/s, %6.2f cycles/elem, %6.2f instrs/elem, %5.2f instrs/cycle, %5.2f branches/elem, %5.2f%% branch misses, %5.2f%% cache misses, %5.2fGHz", | |
padded_name, time, unit, gb_per_s, | |
((double) counts.cycles) / ((double) iterations) / ((double) num), | |
((double) counts.instructions) / ((double) iterations) / ((double) num), | |
((double) counts.instructions) / ((double) counts.cycles), | |
((double) counts.branch_instructions) / ((double) iterations) / ((double) num), | |
100.0 * ((double) counts.branch_misses) / ((double) counts.branch_instructions), | |
100.0 * ((double) counts.cache_misses) / ((double) counts.cache_references), | |
((double) counts.cycles) / counts.seconds / 1000000000.0 | |
); | |
if (ref_out != nullptr) { | |
max_error max_error; | |
for (size_t ix = 0; ix < num; ix++) { | |
float y = in_y[ix]; | |
float x = in_x[ix]; | |
if (!test_negative_zero && (y == -0.0f || x == -0.0f)) { continue; } | |
if ( | |
!test_max && | |
(fabs(y) == numeric_limits<float>::max() || fabs(x) == numeric_limits<float>::max()) | |
) { continue; } | |
if (!test_nan && (isnan(y) || isnan(x))) { continue; } | |
max_error.update(in_y[ix], in_x[ix], ref_out[ix], out[ix]); | |
} | |
printf( | |
", %.9fdeg max error, max error point: %+.9f, %+.9f\n", | |
max_error.error * 180.0f / M_PI, max_error.x, max_error.y | |
); | |
} else { | |
printf("\n"); | |
} | |
} | |
#define run_timed_atan2(pre_iterations, iterations, test_negative_zero, test_max, test_nan, num, fun, ref_out, in_y, in_x, out) \ | |
run_timed(pre_iterations, iterations, test_negative_zero, test_max, test_nan, num, #fun, atan2_ ## fun, ref_out, in_y, in_x, out) | |
// -------------------------------------------------------------------- | |
// main | |
NOINLINE | |
static pair<string, size_t> format_size(size_t size) { | |
string unit = "B"; | |
if (size > 10000ull) { | |
size /= 1000ull; | |
unit = "kB"; | |
} | |
if (size > 10000ull) { | |
size /= 1000ull; | |
unit = "MB"; | |
} | |
if (size > 10000ull) { | |
size /= 1000ull; | |
unit = "GB"; | |
} | |
return { unit, size }; | |
} | |
struct config { | |
bool random_points = false; | |
bool test_edge_cases = false; | |
bool test_baseline = true; | |
bool test_auto_1 = true; | |
bool test_auto_2 = true; | |
bool test_auto_3 = true; | |
bool test_auto_4 = true; | |
bool test_manual_1 = true; | |
bool test_manual_2 = true; | |
}; | |
int main(int argc, const char* argv[]) { | |
cout.precision(numeric_limits<float>::max_digits10); | |
pin_to_cpu_0(); | |
perf_init(); | |
config config; | |
const auto bad_usage = [&argv]() { | |
cerr << "Usage: " << argv[0] << " [--random-points] [--test-edge-cases] [--functions COMMA_SEPARATED_FUNCTIONS] NUMBER_OF_TEST_CASES NUMBER_OF_WARM_UPS NUMBER_OF_ITERATIONS" << endl; | |
exit(EXIT_FAILURE); | |
}; | |
vector<string> arguments; | |
for (int i = 1; i < argc; i++) { | |
string arg{argv[i]}; | |
if (arg == "--random-points") { | |
config.random_points = true; | |
} else if (arg == "--test-edge-cases") { | |
config.test_edge_cases = true; | |
} else if (arg == "--functions") { | |
config.test_baseline = false; | |
config.test_auto_1 = false; | |
config.test_auto_2 = false; | |
config.test_auto_3 = false; | |
config.test_auto_4 = false; | |
config.test_manual_1 = false; | |
config.test_manual_2 = false; | |
if (i < argc - 1) { | |
i++; | |
stringstream functions{ argv[i] }; | |
for (string function; getline(functions, function, ','); ) { | |
if (function == "baseline") { | |
config.test_baseline = true; | |
} else if (function == "auto_1") { | |
config.test_auto_1 = true; | |
} else if (function == "auto_2") { | |
config.test_auto_2 = true; | |
} else if (function == "auto_3") { | |
config.test_auto_3 = true; | |
} else if (function == "auto_4") { | |
config.test_auto_4 = true; | |
} | |
#ifdef USE_AVX | |
else if (function == "manual") { | |
config.test_manual_1 = true; | |
} else if (function == "manual_2") { | |
config.test_manual_2 = true; | |
} | |
#endif | |
else { | |
cerr << "Bad function " << function << endl; | |
bad_usage(); | |
} | |
} | |
} else { | |
cerr << "Expecting argument after --functions" << endl; | |
bad_usage(); | |
} | |
} else { | |
arguments.emplace_back(arg); | |
} | |
} | |
if (arguments.size() != 3) { | |
bad_usage(); | |
} | |
size_t desired_cases; | |
if (sscanf(arguments.at(0).c_str(), "%zu", &desired_cases) != 1) { | |
cerr << "Could not parse number of cases" << endl; | |
bad_usage(); | |
} | |
int pre_iterations; | |
if (sscanf(arguments.at(1).c_str(), "%d", &pre_iterations) != 1) { | |
cerr << "Could not parse warm ups" << endl; | |
bad_usage(); | |
} | |
int iterations; | |
if (sscanf(arguments.at(2).c_str(), "%d", &iterations) != 1) { | |
cerr << "Could not parse iterations" << endl; | |
bad_usage(); | |
} | |
size_t cases; | |
float* ys; | |
float* xs; | |
float* ref_out; | |
float* out; | |
generate_data( | |
config.random_points, config.test_edge_cases, | |
desired_cases, | |
&cases, &ys, &xs, | |
&ref_out, | |
&out | |
); | |
if (!config.test_baseline) { | |
free(ref_out); | |
ref_out = nullptr; | |
} | |
const auto formatted_input_size = format_size(sizeof(float) * 2 * cases); | |
const auto formatted_output_size = format_size(sizeof(float) * cases); | |
cout << "Tests will read " << formatted_input_size.second << formatted_input_size.first << " and write " << formatted_output_size.second << formatted_output_size.first << " (" << cases << " points)" << endl; | |
cout << "Running " << pre_iterations << " warm ups and " << iterations << " iterations" << endl; | |
cout << endl; | |
if (config.test_baseline) { | |
run_timed(pre_iterations, iterations, true, true, true, cases, "baseline", atan2_baseline, nullptr, ys, xs, ref_out); | |
} | |
// auto_1 to auto_3 do not support the max value because the input gets | |
// reduce to negative zero in that case and the atan_input >= 0 branch | |
// doesn't preserve the sign properly. | |
// | |
// Similarly, none of the functions apart from manual_1 and manual_2 support | |
// negative zero properly because of the x < 0 branch. | |
// | |
// manual_2 silently drop NaNs, it can be made to not do that at slight | |
// performance hit (see comment in it). | |
if (config.test_auto_1) { | |
run_timed_atan2(pre_iterations, iterations, false, false, true, cases, auto_1, ref_out, ys, xs, out); | |
} | |
if (config.test_auto_2) { | |
run_timed_atan2(pre_iterations, iterations, false, false, true, cases, auto_2, ref_out, ys, xs, out); | |
} | |
if (config.test_auto_3) { | |
run_timed_atan2(pre_iterations, iterations, false, false, true, cases, auto_3, ref_out, ys, xs, out); | |
} | |
if (config.test_auto_4) { | |
run_timed_atan2(pre_iterations, iterations, false, true, true, cases, auto_4, ref_out, ys, xs, out); | |
} | |
#ifdef USE_AVX | |
if (config.test_manual_1) { | |
run_timed_atan2(pre_iterations, iterations, true, true, true, cases, manual_1, ref_out, ys, xs, out); | |
} | |
if (config.test_manual_2) { | |
run_timed_atan2(pre_iterations, iterations, true, true, true, cases, manual_2, ref_out, ys, xs, out); | |
} | |
#endif | |
cout << endl; | |
free(ys); free(xs); free(ref_out); free(out); | |
perf_close(); | |
return EXIT_SUCCESS; | |
} |
@bitonic , Hi I am using ubuntu 20.04 and I built your vectorized-atan2f.cpp with clang++-12 and run ./vectorized-atan2f but always got the below error:
Error opening leader 0.
Don't know how to resolve this :(
I have installed perf tool in my ubuntu.
Hey,
for the case y < 0, x = 0
atan2_auto_3
returns Pi/2
, but the result should be -Pi/2
.
A fun problem I ran into using code derived from atan2_auto_4(): it is wrong when x is negative zero. Say if y=1 and x=-0, then atan_input is -0 and atan_fma_approximation() returns -0. The quadrant adjustment on line 260 propagates the sign from the -0 giving res=-pi/2, and then the condition x < 0.0f
is false so -pi/2 is the final result. This is a major discontinuity since the correct result is pi/2. To fix it, I believe you can change line 262 from:
if (x < 0.0f) {
to
if (signbit(x)) {
I figured out why I was getting no perf events.
From the manual on perf_event_open(2)
Turns out there was one too many hardware counters for my CPU so none of them returned values, but you also don't get any errors. (Aside: though I was getting a bogus value out of the software task-clock counter, which is weird)
Got a similar result from
perf
but it correctly counts as many as it can and then says<not counted>
And after the bit about
nmi_watchdog
, I did let me get results from all the counters.I'll have to look later why/how
perf
detects that there are too many counters and/or detecting the number of available ones beforehand.