Skip to content

Instantly share code, notes, and snippets.

@bangonkali
Last active October 7, 2018 19:39
Show Gist options
  • Save bangonkali/52137c168f9bb0aaf003 to your computer and use it in GitHub Desktop.
Save bangonkali/52137c168f9bb0aaf003 to your computer and use it in GitHub Desktop.
SIMD
gcc -O2 -std=c99 -msse2 test.c -otest -lrt -lm
sudo apt-get install linux-headers-$(uname -r) build-essential
http://stackoverflow.com/questions/27433045/why-does-this-simd-example-code-in-c-compile-with-mingw-but-the-executable-doesn
http://stackoverflow.com/questions/5217812/c-compilation-issue-with-emmintrin-h-on-linux-gcc
http://blogs.microsoft.co.il/sasha/2011/10/17/simd-optimized-c-code-in-visual-studio-11/
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <emmintrin.h>
#if defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
/* GCC-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>
#elif defined(__GNUC__) && defined(__ARM_NEON__)
/* GCC-compatible compiler, targeting ARM with NEON */
#include <arm_neon.h>
#elif defined(__GNUC__) && defined(__IWMMXT__)
/* GCC-compatible compiler, targeting ARM with WMMX */
#include <mmintrin.h>
#elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__))
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
#include <altivec.h>
#elif defined(__GNUC__) && defined(__SPE__)
/* GCC-compatible compiler, targeting PowerPC with SPE */
#include <spe.h>
#endif
/*
 * Print the four single-precision lanes of an SSE register on one line.
 *
 * Lanes are written from index 3 down to index 0, which is the same order
 * in which _mm_set_ps() takes its arguments, so a vector built with
 * _mm_set_ps(4, 3, 2, 1) prints as "[4, 3, 2, 1]".
 *
 * m: the vector to print.
 */
void __attribute__((noinline)) printv(__m128 m)
{
    float lanes[4];

    /* Spill the register into ordinary floats so printf can read them. */
    _mm_storeu_ps(lanes, m);

    printf("[%g, %g, %g, %g]\n", lanes[3], lanes[2], lanes[1], lanes[0]);
}
void sqrt_normal(float* a, int N)
{
for (int i = 0; i < N; ++i)
a[i] = sqrt(a[i]);
}
/*
 * SSE version: replace each of the first N elements of `a` with its square
 * root, four elements per instruction.
 *
 * a: array of at least N floats, updated in place. No particular alignment
 *    is required because unaligned load/store intrinsics are used.
 * N: number of elements to process; N <= 0 leaves the array untouched.
 *
 * N does not have to be a multiple of four: whatever is left after the
 * last full group of four is handled one element at a time.
 */
void sqrt_sse(float* a, int N)
{
    int i = 0;

    /* Vector body. "N - i >= 4" cannot overflow, unlike "i + 4 <= N". */
    for (; N - i >= 4; i += 4) {
        _mm_storeu_ps(a + i, _mm_sqrt_ps(_mm_loadu_ps(a + i)));
    }

    /* Scalar tail: the remaining 0..3 elements, still using SSE. */
    for (; i < N; ++i) {
        _mm_store_ss(a + i, _mm_sqrt_ss(_mm_load_ss(a + i)));
    }
}
/* Overwrite the first `count` floats of `buf` with `value`. */
static void fill_floats(float *buf, int count, float value)
{
    for (int i = 0; i < count; ++i) {
        buf[i] = value;
    }
}

/* Print the CPU time consumed since `start` as whole seconds + milliseconds. */
static void report_elapsed(clock_t start)
{
    clock_t diff = clock() - start;
    int msec = (int)(diff * 1000 / CLOCKS_PER_SEC);

    printf("Time taken %d seconds %d milliseconds\n", msec / 1000, msec % 1000);
}

/*
 * Demo driver for a handful of SSE intrinsics, followed by a scalar-vs-SSE
 * square-root benchmark.
 *
 * The demo sections always run. The benchmark needs exactly one command
 * line argument: the number of floats to process (a positive integer).
 *
 * Returns 0 on success, 1 on a missing/invalid argument or allocation
 * failure.
 */
int main(int argc, char **argv)
{
    printf("Basic example\n");
    __m128 m = _mm_set_ps(4, 3, 2, 2);
    __m128 z = _mm_setzero_ps();
    printv(m);
    printv(z);

    printf("Arithmetic example\n");
    __m128 m_arithmetic = _mm_set_ps(-4, -3, -2, -1);
    __m128 factor = _mm_set1_ps(1.5f);
    printv(_mm_and_ps(m_arithmetic, _mm_setzero_ps()));  // AND with zero: always a zero vector
    printv(_mm_or_ps(m_arithmetic, _mm_set1_ps(-0.0f))); // Set every sign bit (no change: all lanes already negative)
    printv(_mm_add_ps(m_arithmetic, _mm_setzero_ps()));  // Add 0 (nop; x+0=x)
    printv(_mm_add_ps(m_arithmetic, m));                 // Add m
    printv(_mm_sub_ps(m_arithmetic, _mm_setzero_ps()));  // Subtract 0 (nop; x-0=x)
    printv(_mm_sub_ps(m_arithmetic, m));                 // Subtract m
    printv(_mm_mul_ps(m_arithmetic, factor));            // Multiply every lane by 1.5
    printv(_mm_div_ps(m_arithmetic, factor));            // Divide every lane by 1.5

    printf("Shuffle example\n");
    __m128 m_shuffle = _mm_set_ps(4, 3, 2, 1);
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0xE4); // Identity selector: same order
    printv(m_shuffle);
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Reverses the vector
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Reverses it back: net nop
    printv(m_shuffle);
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Three reversals in a row are
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // equivalent to a single one; a
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // compiler may fold them
    printv(m_shuffle);
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0xC9); // These two shuffles together swap the
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x2D); // low and high pairs (same as 0x4E)
    printv(m_shuffle);
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x55); // Broadcast lane 1 to all four lanes
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x55); // Redundant: all lanes are equal now
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x55); // Repeated to stress the optimizer
    m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x55); // And one last time
    printv(m_shuffle);

    printf("Sqrt example\n");
    /*
     * NOTE(review): the original (non-English) remark here appears to say the
     * answer looks wrong. printv() prints lane 3 first, so the roots show up
     * in the reverse of the array order below, which may explain it.
     */
    float a[] __attribute__ ((aligned (16))) = { 41982., 81.5091, 3.14, 42.666 };
    __m128 t = _mm_sqrt_ps(_mm_load_ps(a)); // aligned load is safe: `a` is 16-byte aligned
    printv(t);

    printf("Benchmark Sqrt example \n");
    if (argc != 2) {
        fprintf(stderr, "usage: %s <element-count>\n", argc > 0 ? argv[0] : "test");
        return 1;
    }

    /* strtol instead of atoi so garbage, overflow and negatives are rejected. */
    errno = 0;
    char *end = NULL;
    long requested = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || errno == ERANGE ||
        requested <= 0 || requested > INT_MAX) {
        fprintf(stderr, "element count must be an integer between 1 and %d\n", INT_MAX);
        return 1;
    }
    int N = (int)requested;

    /* Guard the byte-count multiplication against size_t wrap-around. */
    if ((size_t)N > (size_t)-1 / sizeof(float)) {
        fprintf(stderr, "element count %d is too large for this platform\n", N);
        return 1;
    }

    /* _mm_malloc comes with the intrinsics headers and needs no POSIX feature macro. */
    float *a_benchmark = _mm_malloc((size_t)N * sizeof *a_benchmark, 16);
    if (a_benchmark == NULL) {
        fprintf(stderr, "failed to allocate %d floats\n", N);
        return 1;
    }

    /* Reading one result through a volatile keeps the timed work observable. */
    volatile float sink;

    /* Scalar run: fill first, then time only the square-root pass. */
    fill_floats(a_benchmark, N, 3141592.65358f);
    clock_t start = clock();
    sqrt_normal(a_benchmark, N);
    report_elapsed(start);
    sink = a_benchmark[N - 1];

    /* SSE run: refill so both passes see identical input. */
    fill_floats(a_benchmark, N, 3141592.65358f);
    start = clock();
    sqrt_sse(a_benchmark, N);
    report_elapsed(start);
    sink = a_benchmark[N - 1];
    (void)sink;

    _mm_free(a_benchmark);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment