-
-
Save bangonkali/52137c168f9bb0aaf003 to your computer and use it in GitHub Desktop.
SIMD
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
gcc -O2 -std=c99 -msse test.c -o test -lm -lrt
sudo apt-get install linux-headers-$(uname -r) build-essential | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
http://stackoverflow.com/questions/27433045/why-does-this-simd-example-code-in-c-compile-with-mingw-but-the-executable-doesn | |
http://stackoverflow.com/questions/5217812/c-compilation-issue-with-emmintrin-h-on-linux-gcc | |
http://blogs.microsoft.co.il/sasha/2011/10/17/simd-optimized-c-code-in-visual-studio-11/ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <emmintrin.h>

#if defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
/* GCC-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>
#elif defined(__GNUC__) && defined(__ARM_NEON__)
/* GCC-compatible compiler, targeting ARM with NEON */
#include <arm_neon.h>
#elif defined(__GNUC__) && defined(__IWMMXT__)
/* GCC-compatible compiler, targeting ARM with WMMX */
#include <mmintrin.h>
#elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__))
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
#include <altivec.h>
#elif defined(__GNUC__) && defined(__SPE__)
/* GCC-compatible compiler, targeting PowerPC with SPE */
#include <spe.h>
#endif
/*
 * Print the four single-precision lanes of an SSE vector on one line.
 *
 * Lanes are written highest index first, which is the same order in which
 * the arguments of _mm_set_ps() are given, so a vector built with
 * _mm_set_ps(4, 3, 2, 1) prints as "[4, 3, 2, 1]".
 *
 * Marked noinline so each call remains a real call in the generated code.
 */
void __attribute__((noinline)) printv(__m128 m)
{
    float lane[4];

    /* Spill the register to memory; lane[0] is the lowest element. */
    _mm_storeu_ps(lane, m);

    printf("[%g, %g, %g, %g]\n", lane[3], lane[2], lane[1], lane[0]);
}
void sqrt_normal(float* a, int N) | |
{ | |
for (int i = 0; i < N; ++i) | |
a[i] = sqrt(a[i]); | |
} | |
/*
 * SSE implementation: replace each of the first N elements of `a` with its
 * square root, in place.
 *
 * a  array of at least N floats. No alignment is required: unaligned
 *    load/store intrinsics are used, so a buffer that is not 16-byte aligned
 *    no longer faults.
 * N  number of elements to process; values <= 0 leave the array untouched.
 *
 * Elements are processed four at a time with _mm_sqrt_ps. When N is not a
 * multiple of four, the remaining 1-3 elements are handled one at a time with
 * the scalar SSE instruction instead of being silently skipped.
 */
void sqrt_sse(float* a, int N)
{
    int i = 0;

    /* Full vectors. "N - i >= 4" cannot overflow for any N, unlike "i + 4 <= N". */
    for (; N - i >= 4; i += 4) {
        _mm_storeu_ps(a + i, _mm_sqrt_ps(_mm_loadu_ps(a + i)));
    }

    /* Tail: fewer than four elements left. */
    for (; i < N; ++i) {
        _mm_store_ss(a + i, _mm_sqrt_ss(_mm_load_ss(a + i)));
    }
}
/* Print a clock() tick delta in the benchmark's "seconds / milliseconds" format. */
static void report_elapsed(clock_t ticks)
{
    int msec = (int)(ticks * 1000 / CLOCKS_PER_SEC);
    printf("Time taken %d seconds %d milliseconds\n", msec / 1000, msec % 1000);
}

/* Set the first n elements of buf to the benchmark's constant input value. */
static void fill_benchmark_input(float* buf, int n)
{
    for (int i = 0; i < n; ++i) {
        buf[i] = (float)3141592.65358;
    }
}

/*
 * Demonstrates a handful of SSE intrinsics (set, bitwise, arithmetic, shuffle,
 * sqrt) and then, when given an element count on the command line, times
 * sqrt_normal() against sqrt_sse() on a buffer of that many floats.
 *
 * Usage: test <element-count>
 *
 * Returns 0 on success. Returns 1 when the argument is missing, is not a
 * positive integer that fits the benchmark, or the buffer cannot be allocated;
 * the demo output is still printed in those cases.
 */
int main(int argc, char **argv)
{
    printf("Basic example\n");
    __m128 m = _mm_set_ps(4, 3, 2, 2);
    __m128 z = _mm_setzero_ps();
    printv(m);
    printv(z);

    printf("Arithmetic example\n");
    __m128 m_arithmetic = _mm_set_ps(-4, -3, -2, -1);
    __m128 scale = _mm_set1_ps(1.5f);
    printv(_mm_and_ps(m_arithmetic, _mm_setzero_ps()));  // AND with all-zero bits: always a zero vector
    printv(_mm_or_ps(m_arithmetic, _mm_set1_ps(-0.0f))); // OR in the sign bit (no-op here: every lane is already negative)
    printv(_mm_add_ps(m_arithmetic, _mm_setzero_ps()));  // Add 0 (no-op; x+0=x)
    printv(_mm_add_ps(m_arithmetic, m));                 // Add m lane by lane
    printv(_mm_sub_ps(m_arithmetic, _mm_setzero_ps()));  // Subtract 0 (no-op; x-0=x)
    printv(_mm_sub_ps(m_arithmetic, m));                 // Subtract m lane by lane
    printv(_mm_mul_ps(m_arithmetic, scale));             // Multiply every lane by 1.5
    printv(_mm_div_ps(m_arithmetic, scale));             // Divide every lane by 1.5

    printf("Shuffle example\n");
    __m128 m_shuffle = _mm_set_ps(4, 3, 2, 1);
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0xE4); // Identity selector: same order
    printv(m_shuffle);
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Reverses the vector
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Reverses it again: net no-op
    printv(m_shuffle);
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Reverses the vector
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Reverses it again: net no-op
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Three reversals equal one; could be a single shuffle
    printv(m_shuffle);
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0xC9); // These two shuffles together swap the low and high pairs
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x2D); // and are equivalent to the single selector 0x4E
    printv(m_shuffle);
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x55); // Broadcast lane 1 to all four lanes
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x55); // Redundant: all lanes are already equal
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x55); // Still redundant
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x55); // And one last time
    printv(m_shuffle);

    printf("Sqrt example\n");
    // NOTE(review): the original (Tagalog) comment here appeared to say the
    // answer looks wrong; presumably that refers to single-precision rounding
    // of these double literals -- confirm with the author.
    float a[] __attribute__ ((aligned (16))) = { 41982., 81.5091, 3.14, 42.666 };
    printv(_mm_sqrt_ps(_mm_load_ps(a))); // aligned load is valid: `a` is declared 16-byte aligned

    printf("Benchmark Sqrt example \n");
    if (argc != 2) {
        fprintf(stderr, "usage: %s <element-count>\n", argc > 0 ? argv[0] : "test");
        return 1;
    }

    // strtol instead of atoi so that garbage, overflow and non-positive
    // counts are rejected rather than silently turned into a bogus size.
    errno = 0;
    char *end = NULL;
    long requested = strtol(argv[1], &end, 10);
    if (errno != 0 || end == argv[1] || *end != '\0'
        || requested <= 0 || requested > INT_MAX
        || (size_t)requested > SIZE_MAX / sizeof(float)) {
        fprintf(stderr, "invalid element count: %s\n", argv[1]);
        return 1;
    }
    int N = (int)requested;

    // _mm_malloc/_mm_free come with the SSE intrinsics headers and give a
    // 16-byte aligned buffer without relying on POSIX feature-test macros.
    float* a_benchmark = _mm_malloc((size_t)N * sizeof *a_benchmark, 16);
    if (a_benchmark == NULL) {
        fprintf(stderr, "failed to allocate %d floats\n", N);
        return 1;
    }

    // Reading one result through a volatile keeps the computed values
    // observable so the optimizer cannot treat the sqrt passes as dead stores.
    volatile float sink;

    // Fill before starting the clock so only the sqrt pass is timed; this
    // also keeps first-touch page faults out of the scalar measurement.
    fill_benchmark_input(a_benchmark, N);
    clock_t start = clock();
    sqrt_normal(a_benchmark, N);
    report_elapsed(clock() - start);
    sink = a_benchmark[N - 1];

    fill_benchmark_input(a_benchmark, N);
    start = clock();
    sqrt_sse(a_benchmark, N);
    report_elapsed(clock() - start);
    sink = a_benchmark[N - 1];
    (void)sink;

    _mm_free(a_benchmark);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment