Skip to content

Instantly share code, notes, and snippets.

@inflation
Created January 9, 2017 04:56
Show Gist options
  • Save inflation/6d1120a8889c8fbeff65955331fe62cd to your computer and use it in GitHub Desktop.
Save inflation/6d1120a8889c8fbeff65955331fe62cd to your computer and use it in GitHub Desktop.
#include <chrono>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <thread>

#include <dispatch/dispatch.h> // macOS only
#include <immintrin.h>
// Timing helper. TimeT selects the unit of the returned duration
// (milliseconds unless specified otherwise).
template<typename TimeT = std::chrono::milliseconds>
struct measure
{
    // Invokes func(args...) exactly once, perfectly forwarding both the callable
    // and its arguments, and returns the elapsed steady-clock time converted to
    // TimeT with duration_cast.
    template<typename F, typename ...Args>
    static auto duration(F&& func, Args&&... args)
    {
        using clock = std::chrono::steady_clock;

        const clock::time_point begin = clock::now();
        std::forward<F>(func)(std::forward<Args>(args)...);
        const clock::duration elapsed = clock::now() - begin;

        return std::chrono::duration_cast<TimeT>(elapsed);
    }
};
// Fast approximate reciprocal square root (the classic bit-level trick).
//
// The bit pattern of `number` is reinterpreted as a 32-bit unsigned integer,
// an initial guess is derived from the magic constant 0x5f3759df, and that
// guess is refined with a single Newton-Raphson step.
//
// The bits are moved with std::memcpy through a std::uint32_t. The previous
// `*(long*)&y` cast read 8 bytes from a 4-byte float on platforms where `long`
// is 64-bit, which is undefined behaviour and let stray upper bits leak into
// the result.
//
// number: value to take the reciprocal square root of; the trick is only
//         meaningful for positive, finite inputs.
// returns: an approximation of 1 / sqrt(number).
inline float Q_rsqrt( float number )
{
    static_assert(sizeof(float) == sizeof(std::uint32_t),
                  "Q_rsqrt requires a 32-bit float");

    const float threehalfs = 1.5F;
    const float x2 = number * 0.5F;

    std::uint32_t i;
    std::memcpy(&i, &number, sizeof i);      // float bits -> integer
    i = 0x5f3759df - ( i >> 1 );             // magic-constant initial guess

    float y;
    std::memcpy(&y, &i, sizeof y);           // integer bits -> float
    y = y * ( threehalfs - ( x2 * y * y ) ); // 1st Newton-Raphson iteration
    // y = y * ( threehalfs - ( x2 * y * y ) ); // 2nd iteration, optional
    return y;
}
// Benchmark body: evaluates Q_rsqrt for every integer in [1, 8e8].
//
// The results are summed and the sum is written to a volatile so that an
// optimising compiler cannot discard the calls; previously each return value
// was thrown away, which allowed the entire loop to be removed. The loop bound
// is an integer constant so the comparison no longer converts `i` to double on
// every iteration.
void q_rsqrt() {
    constexpr int kIterations = 800000000; // 8E8, fits in a 32-bit int
    float acc = 0.0F;
    for (int i = 1; i <= kIterations; i++) {
        acc += Q_rsqrt(static_cast<float>(i));
    }
    volatile float sink = acc; // observable store keeps the loop alive
    (void)sink;
}
// Benchmark body: runs _mm_rsqrt_ps over the integers in [start, end], four
// values per iteration. A trailing group of fewer than four values is skipped.
//
// Each vector result is added to an accumulator whose lanes are finally summed
// into a volatile, so the compiler cannot drop the rsqrt work; previously
// `result` was never read and the loop could be optimised away entirely. This
// adds one vector add per iteration to the measured work.
//
// Note: the lane values are converted from long to float, so inputs above 2^24
// are rounded to the nearest representable float.
//
// start: first integer of the range (inclusive).
// end:   last integer of the range (inclusive).
void sse_rsqrt(long start, long end) {
    __m128 acc = _mm_setzero_ps();
    for (long i = start; i+3 <= end; i+=4) {
        const __m128 a = _mm_set_ps(static_cast<float>(i), static_cast<float>(i+1),
                                    static_cast<float>(i+2), static_cast<float>(i+3));
        acc = _mm_add_ps(acc, _mm_rsqrt_ps(a));
    }
    float lanes[4];
    _mm_storeu_ps(lanes, acc);
    volatile float sink = lanes[0] + lanes[1] + lanes[2] + lanes[3];
    (void)sink;
}
// Multi-threaded SSE benchmark: splits [1, 8e9] into 8 equal shards and runs
// sse_rsqrt on each shard in its own std::thread.
//
// All threads are started before any of them is joined. Joining inside the
// launch loop (as before) made every thread finish before the next one was
// created, so the shards ran sequentially.
void sse_rsqrt_mt() {
    constexpr int kThreads = 8;
    const long shard = static_cast<long>(8E9) / kThreads;

    std::thread t[kThreads];
    for (int i = 0; i < kThreads; i++) {
        t[i] = std::thread(sse_rsqrt, i * shard + 1, (i+1) * shard);
    }
    for (std::thread& worker : t) {
        worker.join();
    }
}
void sse_rsqrt_mc() {
int i;
long shard = 8E9 / 8;
dispatch_queue_t sse_q = dispatch_queue_create("sse", DISPATCH_QUEUE_CONCURRENT);
for (i = 0; i < 7; i++) {
dispatch_async(sse_q, ^{ sse_rsqrt(i * shard + 1, (i+1) * shard); });
}
dispatch_sync(sse_q, ^{ sse_rsqrt(i * shard + 1, (i+1) * shard); });
}
// Benchmark body: runs _mm256_rsqrt_ps over the integers in [start, end], eight
// values per iteration. A trailing group of fewer than eight values is skipped.
//
// Each vector result is added to an accumulator whose lanes are finally summed
// into a volatile, so the compiler cannot drop the rsqrt work; previously
// `result` was never read and the loop could be optimised away entirely. This
// adds one vector add per iteration to the measured work.
//
// Note: the lane values are converted from long to float, so inputs above 2^24
// are rounded to the nearest representable float.
//
// start: first integer of the range (inclusive).
// end:   last integer of the range (inclusive).
void avx_rsqrt(long start, long end) {
    __m256 acc = _mm256_setzero_ps();
    for (long i = start; i+7 <= end; i+=8) {
        const __m256 a = _mm256_set_ps(static_cast<float>(i),   static_cast<float>(i+1),
                                       static_cast<float>(i+2), static_cast<float>(i+3),
                                       static_cast<float>(i+4), static_cast<float>(i+5),
                                       static_cast<float>(i+6), static_cast<float>(i+7));
        acc = _mm256_add_ps(acc, _mm256_rsqrt_ps(a));
    }
    float lanes[8];
    _mm256_storeu_ps(lanes, acc);
    float total = 0.0F;
    for (float lane : lanes) {
        total += lane;
    }
    volatile float sink = total;
    (void)sink;
}
// Multi-threaded AVX benchmark: splits [1, 8e9] into 8 equal shards and runs
// avx_rsqrt on each shard in its own std::thread.
//
// All threads are started before any of them is joined. Joining inside the
// launch loop (as before) made every thread finish before the next one was
// created, so the shards ran sequentially.
void avx_rsqrt_mt() {
    constexpr int kThreads = 8;
    const long shard = static_cast<long>(8E9) / kThreads;

    std::thread t[kThreads];
    for (int i = 0; i < kThreads; i++) {
        t[i] = std::thread(avx_rsqrt, i * shard + 1, (i+1) * shard);
    }
    for (std::thread& worker : t) {
        worker.join();
    }
}
void avx_rsqrt_mc() {
int i;
long shard = 8E9 / 8;
dispatch_queue_t avx_q = dispatch_queue_create("avx", DISPATCH_QUEUE_CONCURRENT);
for (i = 0; i < 7; i++) {
dispatch_async(avx_q, ^{ avx_rsqrt(i * shard + 1, (i+1) * shard); });
}
dispatch_sync(avx_q, ^{ avx_rsqrt(i * shard + 1, (i+1) * shard); });
}
// Entry point: times each benchmark variant twice and prints the mean of the
// two runs (in milliseconds, as a floating-point count) on its own line.
int main(int argc, char* argv[])
{
    // Prints `label` followed by the arithmetic mean of two measured durations.
    const auto print_mean = [](const char* label, auto first_run, auto second_run) {
        const auto mean = (first_run + second_run) / 2.0;
        std::cout << label << mean.count() << std::endl;
    };

    print_mean("sse: ",
               measure<>::duration(sse_rsqrt, 1, 8E9),
               measure<>::duration(sse_rsqrt, 1, 8E9));

    print_mean("sse multicore: ",
               measure<>::duration(sse_rsqrt_mc),
               measure<>::duration(sse_rsqrt_mc));

    print_mean("avx: ",
               measure<>::duration(avx_rsqrt, 1, 8E9),
               measure<>::duration(avx_rsqrt, 1, 8E9));

    print_mean("avx multicore: ",
               measure<>::duration(avx_rsqrt_mc),
               measure<>::duration(avx_rsqrt_mc));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment