Created
January 9, 2017 04:56
-
-
Save inflation/6d1120a8889c8fbeff65955331fe62cd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <dispatch/dispatch.h> // macOS only
#include <immintrin.h>

#include <chrono>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <thread>
/// Wall-clock timing helper.
///
/// @tparam TimeT  std::chrono duration type the elapsed time is converted to
///                (milliseconds unless specified otherwise).
template <typename TimeT = std::chrono::milliseconds>
struct measure
{
    /// Invokes `func(args...)` exactly once and returns how long the call took.
    ///
    /// The callable and its arguments are perfectly forwarded; whatever the
    /// callable returns is discarded. Time is read from std::chrono::steady_clock.
    ///
    /// @param func  callable to time
    /// @param args  arguments passed on to `func`
    /// @return      elapsed time, duration_cast to TimeT
    template <typename F, typename... Args>
    static auto duration(F&& func, Args&&... args)
    {
        const auto begin = std::chrono::steady_clock::now();
        std::forward<F>(func)(std::forward<Args>(args)...);
        const auto elapsed = std::chrono::steady_clock::now() - begin;
        return std::chrono::duration_cast<TimeT>(elapsed);
    }
};
/// Fast approximate reciprocal square root (the "Quake III" bit trick).
///
/// Reinterprets the IEEE-754 bits of `number` as a 32-bit integer, derives an
/// initial guess from the magic constant 0x5f3759df, and refines it with one
/// Newton-Raphson step.
///
/// The bits are copied with std::memcpy into a std::uint32_t. Punning through
/// `long` is wrong wherever `long` is 64 bits wide (e.g. macOS/Linux x86-64):
/// it reads past the float, and the right shift drags a stray upper bit into
/// the 32 bits that are converted back to float.
///
/// @param number  value to take 1/sqrt of; the result is only meaningful for
///                positive, finite inputs
/// @return        approximation of 1 / sqrt(number)
inline float Q_rsqrt( float number )
{
    static_assert(sizeof(float) == sizeof(std::uint32_t),
                  "Q_rsqrt requires a 32-bit float");

    const float threehalfs = 1.5F;
    const float x2 = number * 0.5F;

    // Well-defined bit-level reinterpretation of the float.
    std::uint32_t bits;
    std::memcpy(&bits, &number, sizeof bits);

    // Magic-constant initial estimate of 1/sqrt(number).
    bits = 0x5f3759dfU - (bits >> 1);

    float y;
    std::memcpy(&y, &bits, sizeof y);

    y = y * (threehalfs - (x2 * y * y)); // 1st Newton-Raphson iteration
    // y = y * (threehalfs - (x2 * y * y)); // 2nd iteration, optional: more accuracy

    return y;
}
void q_rsqrt() { | |
for (int i = 1; i <= 8E8; i++) { | |
Q_rsqrt(i); | |
} | |
} | |
/// Benchmark body: runs _mm_rsqrt_ps over the integers in [start, end],
/// four values per iteration.
///
/// The loop advances in steps of 4 and only runs while a full group of four
/// (i .. i+3) fits inside the range; a trailing remainder is skipped, and a
/// range shorter than four values does no work.
///
/// @param start  first integer of the range
/// @param end    last integer of the range (inclusive)
void sse_rsqrt(long start, long end) {
    for (long i = start; i + 3 <= end; i += 4) {
        const __m128 a = _mm_set_ps(static_cast<float>(i),
                                    static_cast<float>(i + 1),
                                    static_cast<float>(i + 2),
                                    static_cast<float>(i + 3));
        const __m128 result = _mm_rsqrt_ps(a);

        // Optimisation barrier (GCC/Clang extended asm): the empty statement
        // claims to read `result` from an SSE register, so the compiler has to
        // compute it on every iteration instead of discarding the loop as dead
        // code. It emits no instructions of its own.
        asm volatile("" : : "x"(result));
    }
}
void sse_rsqrt_mt() { | |
std::thread t[8]; | |
long shard = 8E9 / 8; | |
for (int i = 0; i < 8; i++) { | |
t[i] = std::thread(sse_rsqrt, i * shard + 1, (i+1) * shard); | |
t[i].join(); | |
} | |
} | |
void sse_rsqrt_mc() { | |
int i; | |
long shard = 8E9 / 8; | |
dispatch_queue_t sse_q = dispatch_queue_create("sse", DISPATCH_QUEUE_CONCURRENT); | |
for (i = 0; i < 7; i++) { | |
dispatch_async(sse_q, ^{ sse_rsqrt(i * shard + 1, (i+1) * shard); }); | |
} | |
dispatch_sync(sse_q, ^{ sse_rsqrt(i * shard + 1, (i+1) * shard); }); | |
} | |
void avx_rsqrt(long start, long end) { | |
__m256 a; | |
__m256 result; | |
for (long i = start; i+7 <= end; i+=8) { | |
a = _mm256_set_ps(i, i+1, i+2, i+3, i+4, i+5, i+6, i+7); | |
result = _mm256_rsqrt_ps(a); | |
} | |
} | |
void avx_rsqrt_mt() { | |
std::thread t[8]; | |
long shard = 8E9 / 8; | |
for (int i = 0; i < 8; i++) { | |
t[i] = std::thread(avx_rsqrt, i * shard + 1, (i+1) * shard); | |
t[i].join(); | |
} | |
} | |
void avx_rsqrt_mc() { | |
int i; | |
long shard = 8E9 / 8; | |
dispatch_queue_t avx_q = dispatch_queue_create("avx", DISPATCH_QUEUE_CONCURRENT); | |
for (i = 0; i < 7; i++) { | |
dispatch_async(avx_q, ^{ avx_rsqrt(i * shard + 1, (i+1) * shard); }); | |
} | |
dispatch_sync(avx_q, ^{ avx_rsqrt(i * shard + 1, (i+1) * shard); }); | |
} | |
int main(int argc, char* argv[]) | |
{ | |
auto avg = (measure<>::duration(sse_rsqrt, 1, 8E9) + measure<>::duration(sse_rsqrt, 1, 8E9)) / 2.0; | |
std::cout << "sse: " << avg.count() << std::endl; | |
avg = (measure<>::duration(sse_rsqrt_mc) + measure<>::duration(sse_rsqrt_mc)) / 2.0; | |
std::cout << "sse multicore: " << avg.count() << std::endl; | |
avg = (measure<>::duration(avx_rsqrt, 1, 8E9) + measure<>::duration(avx_rsqrt, 1, 8E9)) / 2.0; | |
std::cout << "avx: " << avg.count() << std::endl; | |
avg = (measure<>::duration(avx_rsqrt_mc) + measure<>::duration(avx_rsqrt_mc)) / 2.0; | |
std::cout << "avx multicore: " << avg.count() << std::endl; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment