Last active
September 3, 2021 12:08
-
-
Save kbenzie/5323248 to your computer and use it in GitHub Desktop.
Example of using SSE instructions showing the increase in performance.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <xmmintrin.h>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <omp.h>
/* | |
* void example() | |
* { | |
* // Size of float array | |
* const int size = 4; | |
* | |
* #ifndef DYNAMIC | |
* // Each float array processed by SSE instruction should have | |
* // 16-byte alignment. | |
* __declspec(align(16)) float m_floats[size]; | |
* #else | |
* // Use _aligned_malloc to dynamically allocate the array.
* auto m_floats = (float*)_aligned_malloc(size * sizeof(float), 16); | |
* #endif | |
* | |
* // Note: the fields of an __m128 should not be accessed directly;
* // variables of that type are aligned on 16-byte boundaries.
* | |
* #ifdef DYNAMIC | |
* _aligned_free(m_floats); | |
* #endif | |
* } | |
*/ | |
/**
 * Scalar reference implementation.
 *
 * For every index i in [0, size) computes
 *     result[i] = sqrt(array1[i] * array2[i] + array2[i] * array2[i]) + 0.5
 * entirely in single precision. A non-positive size writes nothing.
 */
void compute_array_cpp(float *array1,  // [in]  first source array
                       float *array2,  // [in]  second source array
                       float *result,  // [out] result array
                       int size)       // [in]  size of arrays
{
    for (int i = 0; i < size; ++i) {
        const float a = array1[i];
        const float b = array2[i];
        // std::sqrt selects the float overload; an unqualified sqrt() may
        // resolve to the C double version, which promotes the argument and
        // rounds differently from the SSE path this is benchmarked against.
        result[i] = std::sqrt(a * b + b * b) + 0.5f;
    }
}
/**
 * SSE implementation of
 *     result[i] = sqrt(array1[i] * array2[i] + array2[i] * array2[i]) + 0.5
 * for every index i in [0, size).
 *
 * Four floats are processed per iteration with packed single-precision
 * instructions; the final size % 4 elements are handled by a scalar tail so
 * that every element of result is written. A non-positive size writes nothing.
 *
 * All three arrays must be 16-byte aligned: the aligned load/store intrinsics
 * used here fault on unaligned addresses.
 */
void compute_array_sse(float *array1,  // [in]  first source array
                       float *array2,  // [in]  second source array
                       float *result,  // [out] result array
                       int size)       // [in]  size of arrays
{
    if (size <= 0) {
        return;
    }

    const __m128 half = _mm_set_ps1(0.5f);   // all four lanes = 0.5
    const int vector_end = size - size % 4;  // largest multiple of 4 <= size

    for (int i = 0; i < vector_end; i += 4) {
        const __m128 a = _mm_load_ps(array1 + i);
        const __m128 b = _mm_load_ps(array2 + i);
        const __m128 ab = _mm_mul_ps(a, b);        // a * b
        const __m128 bb = _mm_mul_ps(b, b);        // b * b
        const __m128 root = _mm_sqrt_ps(_mm_add_ps(ab, bb));
        _mm_store_ps(result + i, _mm_add_ps(root, half));
    }

    // Scalar tail: the elements that do not fill a whole 4-float vector.
    for (int i = vector_end; i < size; ++i) {
        const float a = array1[i];
        const float b = array2[i];
        result[i] = std::sqrt(a * b + b * b) + 0.5f;
    }
}
int main() | |
{ | |
using namespace std; | |
cout << "Allocating aligned array blocks\n"; | |
auto size = 100000; | |
auto source1 = (float*)_aligned_malloc(size * sizeof(float), 16); | |
auto source2 = (float*)_aligned_malloc(size * sizeof(float), 16); | |
auto result_cpp = (float*)_aligned_malloc(size * sizeof(float), 16); | |
auto result_sse = (float*)_aligned_malloc(size * sizeof(float), 16); | |
cout << "Assigning random numbers to source arrays\n"; | |
srand(int(omp_get_wtime())); | |
for (auto i=0; i<size; ++i) { | |
source1[i] = float(rand() % 1000 - 500); | |
source2[i] = float(rand() % 1000 - 500); | |
} | |
double start, end, time_cpp, time_sse; | |
cout << "Benchmarking standard cpp implementation: "; | |
start = omp_get_wtime(); | |
compute_array_cpp(source1, source2, result_cpp, size); | |
end = omp_get_wtime(); | |
time_cpp = end-start; | |
cout << time_cpp << " seconds\n"; | |
cout << "Benchmarking sse implementation: "; | |
start = omp_get_wtime(); | |
compute_array_sse(source1, source2, result_sse, size); | |
end = omp_get_wtime(); | |
time_sse = end-start; | |
cout << time_sse << " seconds\n"; | |
cout << time_cpp/time_sse << "x speedup\n"; | |
cout << "Freeing aligned array blocks\n"; | |
_aligned_free(source1); | |
_aligned_free(source2); | |
_aligned_free(result_cpp); | |
_aligned_free(result_sse); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment