Skip to content

Instantly share code, notes, and snippets.

@emoon
Last active December 13, 2015 22:19
Show Gist options
  • Save emoon/4983834 to your computer and use it in GitHub Desktop.
Save emoon/4983834 to your computer and use it in GitHub Desktop.
Find the 4 highest values in an array and their indices
#include <intrin.h>
#include <emmintrin.h>
#include <float.h>
#include <assert.h>
#include <stdlib.h>
int main(int argc, char* argv[])
{
const float a[] = { 1, 2, 3, 4, 1.5f, 4, 2, 12, 3, 9, 5, 9, 1, 2, 12, 3, 1, 59, 3, 4, 9 };
const size_t acount = sizeof(a)/sizeof(a[0]);
// Find the 4 highest values in a and their indices
__declspec(align(16)) float sinit = -FLT_MAX;
__declspec(align(16)) int iinit[4] = {-1, -1, -1, -1};
// Initialize all the scores to -FLT_MAX
__m128 s = _mm_load_ps1(&sinit);
// We just do shuffles and blends of the indices, so we store the ints as floats.
__m128 index = _mm_load_ps((float*)iinit);
int i = 0;
for(const float* pa = a, *paend = a + acount; pa != paend; ++pa, ++i)
{
// Load the index into all 4 elements of im
__m128 im = _mm_load_ps1((float*)&i);
// Load a value from the array into all 4 elements in v
__m128 v = _mm_load_ps1(pa);
// Compare with the currently best scores
__m128 cmp = _mm_cmpge_ps(v, s);
// Convert to a mask which is one of 0000, 1000, 1100, 1110 or 1111
// Switch on the mask and shuffle/blend as appropriate.
// The same operation is done on both s and index to keep them in sync.
switch(_mm_movemask_ps(cmp))
{
case 0x0:
// dcba -> dcba
break;
case 0x8:
// dcba -> Vcba
s = _mm_blend_ps(s, v, 8);
index = _mm_blend_ps(index, im, 8);
break;
case 0xc:
// dcba -> cVba
s = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 2, 1, 0));
s = _mm_blend_ps(s, v, 4);
index = _mm_shuffle_ps(index, index, _MM_SHUFFLE(2, 2, 1, 0));
index = _mm_blend_ps(index, im, 4);
break;
case 0xe:
// dcba -> cbVa
s = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 1, 1, 0));
s = _mm_blend_ps(s, v, 2);
index = _mm_shuffle_ps(index, index, _MM_SHUFFLE(2, 1, 1, 0));
index = _mm_blend_ps(index, im, 2);
break;
case 0xf:
// dcba -> cbaV
s = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 1, 0, 0));
s = _mm_blend_ps(s, v, 1);
index = _mm_shuffle_ps(index, index, _MM_SHUFFLE(2, 1, 0, 0));
index = _mm_blend_ps(index, im, 1);
break;
default:
assert(0);
break;
}
}
// Exctract the results. For < 4 values we could check either value or index
// for -FLT_MAX or -1 respectively
__declspec(align(16)) float highestValues[4];
_mm_store_ps(highestValues, s);
__declspec(align(16)) int indexWithHighestValue[4];
_mm_store_ps((float*)indexWithHighestValue, index);
// Prevent optimizations
return (int)highestValues[0] + indexWithHighestValue[0];
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment