Last active
December 13, 2015 22:19
-
-
Save emoon/4983834 to your computer and use it in GitHub Desktop.
Find the 4 highest values in an array and their indices
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <intrin.h> | |
#include <emmintrin.h> | |
#include <float.h> | |
#include <assert.h> | |
#include <stdlib.h> | |
int main(int argc, char* argv[]) | |
{ | |
const float a[] = { 1, 2, 3, 4, 1.5f, 4, 2, 12, 3, 9, 5, 9, 1, 2, 12, 3, 1, 59, 3, 4, 9 }; | |
const size_t acount = sizeof(a)/sizeof(a[0]); | |
// Find the 4 highest values in a and their indices | |
__declspec(align(16)) float sinit = -FLT_MAX; | |
__declspec(align(16)) int iinit[4] = {-1, -1, -1, -1}; | |
// Initialize all the scores to -FLT_MAX | |
__m128 s = _mm_load_ps1(&sinit); | |
// We just do shuffles and blends of the indices, so we store the ints as floats. | |
__m128 index = _mm_load_ps((float*)iinit); | |
int i = 0; | |
for(const float* pa = a, *paend = a + acount; pa != paend; ++pa, ++i) | |
{ | |
// Load the index into all 4 elements of im | |
__m128 im = _mm_load_ps1((float*)&i); | |
// Load a value from the array into all 4 elements in v | |
__m128 v = _mm_load_ps1(pa); | |
// Compare with the currently best scores | |
__m128 cmp = _mm_cmpge_ps(v, s); | |
// Convert to a mask which is one of 0000, 1000, 1100, 1110 or 1111 | |
// Switch on the mask and shuffle/blend as appropriate. | |
// The same operation is done on both s and index to keep them in sync. | |
switch(_mm_movemask_ps(cmp)) | |
{ | |
case 0x0: | |
// dcba -> dcba | |
break; | |
case 0x8: | |
// dcba -> Vcba | |
s = _mm_blend_ps(s, v, 8); | |
index = _mm_blend_ps(index, im, 8); | |
break; | |
case 0xc: | |
// dcba -> cVba | |
s = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 2, 1, 0)); | |
s = _mm_blend_ps(s, v, 4); | |
index = _mm_shuffle_ps(index, index, _MM_SHUFFLE(2, 2, 1, 0)); | |
index = _mm_blend_ps(index, im, 4); | |
break; | |
case 0xe: | |
// dcba -> cbVa | |
s = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 1, 1, 0)); | |
s = _mm_blend_ps(s, v, 2); | |
index = _mm_shuffle_ps(index, index, _MM_SHUFFLE(2, 1, 1, 0)); | |
index = _mm_blend_ps(index, im, 2); | |
break; | |
case 0xf: | |
// dcba -> cbaV | |
s = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 1, 0, 0)); | |
s = _mm_blend_ps(s, v, 1); | |
index = _mm_shuffle_ps(index, index, _MM_SHUFFLE(2, 1, 0, 0)); | |
index = _mm_blend_ps(index, im, 1); | |
break; | |
default: | |
assert(0); | |
break; | |
} | |
} | |
// Exctract the results. For < 4 values we could check either value or index | |
// for -FLT_MAX or -1 respectively | |
__declspec(align(16)) float highestValues[4]; | |
_mm_store_ps(highestValues, s); | |
__declspec(align(16)) int indexWithHighestValue[4]; | |
_mm_store_ps((float*)indexWithHighestValue, index); | |
// Prevent optimizations | |
return (int)highestValues[0] + indexWithHighestValue[0]; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment