bangonkali · October 7, 2018 19:39
diff --git a/dev b/dev
 gcc -O2 -std=c99 -msse2 test.c -o test -lm -lrt
 sudo apt-get install linux-headers-$(uname -r) build-essential

diff --git a/references b/references
 http://stackoverflow.com/questions/27433045/why-does-this-simd-example-code-in-c-compile-with-mingw-but-the-executable-doesn
 http://stackoverflow.com/questions/5217812/c-compilation-issue-with-emmintrin-h-on-linux-gcc
 http://blogs.microsoft.co.il/sasha/2011/10/17/simd-optimized-c-code-in-visual-studio-11/
diff --git a/test.c b/test.c
 #include <errno.h>
 #include <limits.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 #include <emmintrin.h>
 #if defined(_MSC_VER)
     /* Microsoft C/C++-compatible compiler */
     #include <intrin.h>
 #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
     /* GCC-compatible compiler, targeting x86/x86-64 */
     #include <x86intrin.h>
 #elif defined(__GNUC__) && defined(__ARM_NEON__)
     /* GCC-compatible compiler, targeting ARM with NEON */
     #include <arm_neon.h>
 #elif defined(__GNUC__) && defined(__IWMMXT__)
     /* GCC-compatible compiler, targeting ARM with WMMX */
     #include <mmintrin.h>
 #elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__))
     /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
     #include <altivec.h>
 #elif defined(__GNUC__) && defined(__SPE__)
     /* GCC-compatible compiler, targeting PowerPC with SPE */
     #include <spe.h>
 #endif

 /*
  * Print the four float lanes of an SSE register on one line.
  *
  * Lanes are written highest index first (lane 3 down to lane 0), which is
  * the same order in which _mm_set_ps() takes its arguments.
  */
 void __attribute__((noinline)) printv(__m128 m)
 {
 	float lanes[4];

 	/* Spill the register to memory; the unaligned store has no alignment requirement. */
 	_mm_storeu_ps(lanes, m);
 	printf("[%g, %g, %g, %g]\n", lanes[3], lanes[2], lanes[1], lanes[0]);
 }

 void sqrt_normal(float* a, int N)                                                                                                                                                                                     
 {                                                                                                                                                                                                                
 	for (int i = 0; i < N; ++i)                                                                                                                                                                                    
 		a[i] = sqrt(a[i]);                                                                                                                                                                                           
 }                                                                                                                                                                                                                

 /*
  * SSE version: replace each of the first N elements of a with its square
  * root, four elements per instruction.
  *
  * a: array of at least N floats, modified in place. Any alignment is
  *    accepted because unaligned loads and stores are used.
  * N: number of elements to process. It does not have to be a multiple of
  *    four; the remaining 1-3 elements are handled one at a time.
  *
  * A NULL array or a non-positive N is a no-op.
  */
 void sqrt_sse(float* a, int N)
 {
 	if (a == NULL || N <= 0)
 		return;

 	/* Largest multiple of four that fits in N. */
 	int vec_end = N - (N % 4);
 	int i = 0;

 	for (; i < vec_end; i += 4)
 		_mm_storeu_ps(a + i, _mm_sqrt_ps(_mm_loadu_ps(a + i)));

 	/* Tail: single-lane square root for the elements that do not fill a vector. */
 	for (; i < N; ++i)
 		_mm_store_ss(a + i, _mm_sqrt_ss(_mm_load_ss(a + i)));
 }

 /* Print a clock() tick count as whole seconds plus the remaining milliseconds. */
 static void print_elapsed(clock_t ticks)
 {
 	int msec = (int)(ticks * 1000 / CLOCKS_PER_SEC);
 	printf("Time taken %d seconds %d milliseconds\n", msec/1000, msec%1000);
 }

 /*
  * Demo driver for a handful of SSE intrinsics, followed by a scalar vs. SSE
  * square-root benchmark.
  *
  * Usage: test N
  *   N is the number of floats in the benchmark buffer (non-negative).
  *
  * Returns 0 on success and 1 when the argument is missing or invalid or the
  * benchmark buffer cannot be allocated. The intrinsic demos are always
  * printed before the argument is inspected.
  */
 int main(int argc, char **argv)
 {
 	printf("Basic example\n");
 	__m128 m = _mm_set_ps(4, 3, 2, 2);
 	__m128 z = _mm_setzero_ps();

 	printv(m);
 	printv(z);

 	printf("Arithmetic example\n");

 	__m128 m_arithmetic = _mm_set_ps(-4, -3, -2, -1);
 	__m128 scale = _mm_set1_ps(1.5f);

 	printv(_mm_and_ps(m_arithmetic, _mm_setzero_ps())); // AND with all-zero bits: always a zero vector
 	printv(_mm_or_ps(m_arithmetic, _mm_set1_ps(-0.0f))); // OR in the sign bit: forces every lane negative (no-op here, all lanes already are)
 	printv(_mm_add_ps(m_arithmetic, _mm_setzero_ps())); // Add 0 (no-op; x+0=x)
 	printv(_mm_add_ps(m_arithmetic, m)); // Add m lane by lane
 	printv(_mm_sub_ps(m_arithmetic, _mm_setzero_ps())); // Subtract 0 (no-op; x-0=x)
 	printv(_mm_sub_ps(m_arithmetic, m)); // Subtract m lane by lane
 	printv(_mm_mul_ps(m_arithmetic, scale)); // Multiply every lane by 1.5
 	printv(_mm_div_ps(m_arithmetic, scale)); // Divide every lane by 1.5

 	printf("Shuffle example\n");
 	__m128 m_shuffle = _mm_set_ps(4, 3, 2, 1);
 	m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0xE4); // Identity selector: lanes keep their order
 	printv(m_shuffle);

 	m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Reverses the vector
 	m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Reverses it back, so the pair is a no-op
 	printv(m_shuffle);

 	m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Reverses the vector
 	m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Reverses it back
 	m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Net effect of the three is a single reversal
 	printv(m_shuffle);

 	m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0xC9); // These two shuffles together swap the low and high pairs
 	m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x2D); // Equivalent to one shuffle with selector 0x4E
 	printv(m_shuffle);

 	m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x55); // Broadcast lane 1 to all four lanes
 	m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x55); // Redundant: all lanes are already equal
 	m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x55); // Still redundant
 	m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x55); // And one last time
 	printv(m_shuffle);

 	printf("Sqrt example\n");
 	// Original author's note here said (roughly translated) that the answer looks wrong.
 	// printv() prints lane 3 first, so the results appear in the reverse of this initializer order.
 	float a[] __attribute__ ((aligned (16))) = { 41982.,  81.5091, 3.14, 42.666 };
 	__m128 t = _mm_sqrt_ps(_mm_load_ps(a)); // aligned load is valid: a is declared 16-byte aligned
 	printv(t);

 	printf("Benchmark Sqrt example \n");
 	if (argc != 2) {
 		fprintf(stderr, "usage: %s N\n", argc > 0 ? argv[0] : "test");
 		return 1;
 	}

 	// strtol instead of atoi so that garbage and out-of-range input are detected.
 	errno = 0;
 	char *end = NULL;
 	long requested = strtol(argv[1], &end, 10);
 	if (end == argv[1] || *end != '\0' || errno == ERANGE || requested < 0 || requested > INT_MAX) {
 		fprintf(stderr, "N must be an integer between 0 and %d\n", INT_MAX);
 		return 1;
 	}
 	int N = (int)requested;

 	// Allocate at least one element so that a NULL result always means failure.
 	size_t count = N > 0 ? (size_t)N : 1;
 	if (count > (size_t)-1 / sizeof(float)) {
 		fprintf(stderr, "N is too large for this platform\n");
 		return 1;
 	}
 	// 16-byte alignment keeps the buffer usable with aligned SSE loads and stores.
 	float *a_benchmark = _mm_malloc(count * sizeof *a_benchmark, 16);
 	if (a_benchmark == NULL) {
 		fprintf(stderr, "could not allocate %d floats\n", N);
 		return 1;
 	}

 	// Scalar run. The buffer is filled before the clock starts so that only
 	// the square-root pass is timed.
 	for (int i = 0; i < N; ++i)
 		a_benchmark[i] = 3141592.65358;

 	clock_t start = clock();
 	sqrt_normal(a_benchmark, N);
 	print_elapsed(clock() - start);

 	// SSE run on a freshly refilled buffer, timed the same way.
 	for (int i = 0; i < N; ++i)
 		a_benchmark[i] = 3141592.65358;

 	start = clock();
 	sqrt_sse(a_benchmark, N);
 	print_elapsed(clock() - start);

 	_mm_free(a_benchmark);
 	return 0;
 }
	gcc -O2 -std=c99 -msse2 test.c -o test -lm -lrt
	sudo apt-get install linux-headers-$(uname -r) build-essential
	http://stackoverflow.com/questions/27433045/why-does-this-simd-example-code-in-c-compile-with-mingw-but-the-executable-doesn
	http://stackoverflow.com/questions/5217812/c-compilation-issue-with-emmintrin-h-on-linux-gcc
	http://blogs.microsoft.co.il/sasha/2011/10/17/simd-optimized-c-code-in-visual-studio-11/
	#include <stdio.h>
	#include <math.h>
	#include <emmintrin.h>
	#include <time.h>
	#if defined(_MSC_VER)
	/* Microsoft C/C++-compatible compiler */
	#include <intrin.h>
	#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
	/* GCC-compatible compiler, targeting x86/x86-64 */
	#include <x86intrin.h>
	#elif defined(__GNUC__) && defined(__ARM_NEON__)
	/* GCC-compatible compiler, targeting ARM with NEON */
	#include <arm_neon.h>
	#elif defined(__GNUC__) && defined(__IWMMXT__)
	/* GCC-compatible compiler, targeting ARM with WMMX */
	#include <mmintrin.h>
	#elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__))
	/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
	#include <altivec.h>
	#elif defined(__GNUC__) && defined(__SPE__)
	/* GCC-compatible compiler, targeting PowerPC with SPE */
	#include <spe.h>
	#endif

	/*
	 * Print the four float lanes of an SSE register on one line.
	 *
	 * Lanes are written highest index first (lane 3 down to lane 0), which is
	 * the same order in which _mm_set_ps() takes its arguments.
	 */
	void __attribute__((noinline)) printv(__m128 m)
	{
		float lanes[4];

		/* Spill the register to memory; the unaligned store has no alignment requirement. */
		_mm_storeu_ps(lanes, m);
		printf("[%g, %g, %g, %g]\n", lanes[3], lanes[2], lanes[1], lanes[0]);
	}

	void sqrt_normal(float* a, int N)
	{
	for (int i = 0; i < N; ++i)
	a[i] = sqrt(a[i]);
	}

	/*
	 * SSE version: replace each of the first N elements of a with its square
	 * root, four elements per instruction.
	 *
	 * a: array of at least N floats, modified in place. Any alignment is
	 *    accepted because unaligned loads and stores are used.
	 * N: number of elements to process. It does not have to be a multiple of
	 *    four; the remaining 1-3 elements are handled one at a time.
	 *
	 * A NULL array or a non-positive N is a no-op.
	 */
	void sqrt_sse(float* a, int N)
	{
		if (a == NULL || N <= 0)
			return;

		/* Largest multiple of four that fits in N. */
		int vec_end = N - (N % 4);
		int i = 0;

		for (; i < vec_end; i += 4)
			_mm_storeu_ps(a + i, _mm_sqrt_ps(_mm_loadu_ps(a + i)));

		/* Tail: single-lane square root for the elements that do not fill a vector. */
		for (; i < N; ++i)
			_mm_store_ss(a + i, _mm_sqrt_ss(_mm_load_ss(a + i)));
	}

	/* Print a clock() tick count as whole seconds plus the remaining milliseconds. */
	static void print_elapsed(clock_t ticks)
	{
		int msec = (int)(ticks * 1000 / CLOCKS_PER_SEC);
		printf("Time taken %d seconds %d milliseconds\n", msec/1000, msec%1000);
	}

	/*
	 * Demo driver for a handful of SSE intrinsics, followed by a scalar vs. SSE
	 * square-root benchmark.
	 *
	 * Usage: test N
	 *   N is the number of floats in the benchmark buffer (non-negative).
	 *
	 * Returns 0 on success and 1 when the argument is missing or invalid or the
	 * benchmark buffer cannot be allocated. The intrinsic demos are always
	 * printed before the argument is inspected.
	 */
	int main(int argc, char **argv)
	{
		printf("Basic example\n");
		__m128 m = _mm_set_ps(4, 3, 2, 2);
		__m128 z = _mm_setzero_ps();

		printv(m);
		printv(z);

		printf("Arithmetic example\n");

		__m128 m_arithmetic = _mm_set_ps(-4, -3, -2, -1);
		__m128 scale = _mm_set1_ps(1.5f);

		printv(_mm_and_ps(m_arithmetic, _mm_setzero_ps())); // AND with all-zero bits: always a zero vector
		printv(_mm_or_ps(m_arithmetic, _mm_set1_ps(-0.0f))); // OR in the sign bit: forces every lane negative (no-op here, all lanes already are)
		printv(_mm_add_ps(m_arithmetic, _mm_setzero_ps())); // Add 0 (no-op; x+0=x)
		printv(_mm_add_ps(m_arithmetic, m)); // Add m lane by lane
		printv(_mm_sub_ps(m_arithmetic, _mm_setzero_ps())); // Subtract 0 (no-op; x-0=x)
		printv(_mm_sub_ps(m_arithmetic, m)); // Subtract m lane by lane
		printv(_mm_mul_ps(m_arithmetic, scale)); // Multiply every lane by 1.5
		printv(_mm_div_ps(m_arithmetic, scale)); // Divide every lane by 1.5

		printf("Shuffle example\n");
		__m128 m_shuffle = _mm_set_ps(4, 3, 2, 1);
		m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0xE4); // Identity selector: lanes keep their order
		printv(m_shuffle);

		m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Reverses the vector
		m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Reverses it back, so the pair is a no-op
		printv(m_shuffle);

		m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Reverses the vector
		m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Reverses it back
		m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x1B); // Net effect of the three is a single reversal
		printv(m_shuffle);

		m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0xC9); // These two shuffles together swap the low and high pairs
		m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x2D); // Equivalent to one shuffle with selector 0x4E
		printv(m_shuffle);

		m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x55); // Broadcast lane 1 to all four lanes
		m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x55); // Redundant: all lanes are already equal
		m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x55); // Still redundant
		m_shuffle = _mm_shuffle_ps(m_shuffle, m_shuffle, 0x55); // And one last time
		printv(m_shuffle);

		printf("Sqrt example\n");
		// Original author's note here said (roughly translated) that the answer looks wrong.
		// printv() prints lane 3 first, so the results appear in the reverse of this initializer order.
		float a[] __attribute__ ((aligned (16))) = { 41982., 81.5091, 3.14, 42.666 };
		__m128 t = _mm_sqrt_ps(_mm_load_ps(a)); // aligned load is valid: a is declared 16-byte aligned
		printv(t);

		printf("Benchmark Sqrt example \n");
		if (argc != 2) {
			fprintf(stderr, "usage: %s N\n", argc > 0 ? argv[0] : "test");
			return 1;
		}

		// strtol instead of atoi so that garbage and out-of-range input are detected.
		errno = 0;
		char *end = NULL;
		long requested = strtol(argv[1], &end, 10);
		if (end == argv[1] || *end != '\0' || errno == ERANGE || requested < 0 || requested > INT_MAX) {
			fprintf(stderr, "N must be an integer between 0 and %d\n", INT_MAX);
			return 1;
		}
		int N = (int)requested;

		// Allocate at least one element so that a NULL result always means failure.
		size_t count = N > 0 ? (size_t)N : 1;
		if (count > (size_t)-1 / sizeof(float)) {
			fprintf(stderr, "N is too large for this platform\n");
			return 1;
		}
		// 16-byte alignment keeps the buffer usable with aligned SSE loads and stores.
		float *a_benchmark = _mm_malloc(count * sizeof *a_benchmark, 16);
		if (a_benchmark == NULL) {
			fprintf(stderr, "could not allocate %d floats\n", N);
			return 1;
		}

		// Scalar run. The buffer is filled before the clock starts so that only
		// the square-root pass is timed.
		for (int i = 0; i < N; ++i)
			a_benchmark[i] = 3141592.65358;

		clock_t start = clock();
		sqrt_normal(a_benchmark, N);
		print_elapsed(clock() - start);

		// SSE run on a freshly refilled buffer, timed the same way.
		for (int i = 0; i < N; ++i)
			a_benchmark[i] = 3141592.65358;

		start = clock();
		sqrt_sse(a_benchmark, N);
		print_elapsed(clock() - start);

		_mm_free(a_benchmark);
		return 0;
	}