Skip to content

Instantly share code, notes, and snippets.

@kuribas
Created August 3, 2017 11:14
Show Gist options
  • Save kuribas/55815fb4178bc410043a3277b25f8653 to your computer and use it in GitHub Desktop.
Save kuribas/55815fb4178bc410043a3277b25f8653 to your computer and use it in GitHub Desktop.
sse code
#include "emmintrin.h"
#include "stdlib.h"
/**
 * Compute the total amount of water trapped between towers.
 *
 * For every position the trapped water is
 *     min(max height to the left, max height to the right) - own height,
 * where both maxima include the position itself. The work is done on
 * 16 unsigned bytes at a time using SSE2 only (emmintrin.h).
 *
 * Heights are narrowed to unsigned 8-bit values before processing, so the
 * result is only meaningful when every height is in the range 0..255.
 *
 * @param length  number of entries in `towers`; values <= 0 yield 0
 * @param towers  tower heights; read only, ownership stays with the caller
 * @return total trapped water (low 32 bits of the sum), or 0 when the input
 *         is empty or the scratch buffers cannot be allocated
 */
int tower(int length, int *towers)
{
    unsigned char *maxt, *towers8;
    size_t i, count, nblocks, padded;
    __m128i vec1, vec2, vec3, total;
    const __m128i zero = _mm_setzero_si128();

    if (length <= 0 || towers == NULL)
        return 0;

    /* Round up to whole 16-byte vectors; size_t avoids int overflow. */
    count = (size_t)length;
    nblocks = (count + 15) / 16;
    padded = nblocks * 16;

    /* aligned_alloc needs a size that is a multiple of the alignment,
       which `padded` is by construction. */
    towers8 = aligned_alloc(sizeof(__m128i), padded);
    maxt = aligned_alloc(sizeof(__m128i), padded);
    if (maxt == NULL || towers8 == NULL) {
        free(towers8);
        free(maxt);
        return 0;
    }

    /* copy from int32[] to uint8[] */
    for (i = 0; i < count; i++)
        towers8[i] = (unsigned char)towers[i];
    /* pad with zeros: zero-height towers past the end hold no water */
    for (; i < padded; i++)
        towers8[i] = 0;

    /* Left-to-right pass: maxt[k] = max(towers8[0..k]). */
    vec2 = zero; /* running maximum of all previous vectors, broadcast */
    for (i = 0; i < nblocks; i++) {
        vec1 = _mm_load_si128((const __m128i *)towers8 + i);
        vec1 = _mm_max_epu8(vec1, vec2);
        /* In-register inclusive prefix max. Shifting towards higher byte
           indices fills with zeros, the identity for unsigned max. */
        vec1 = _mm_max_epu8(vec1, _mm_slli_si128(vec1, 1));
        vec1 = _mm_max_epu8(vec1, _mm_slli_si128(vec1, 2));
        vec1 = _mm_max_epu8(vec1, _mm_slli_si128(vec1, 4));
        vec1 = _mm_max_epu8(vec1, _mm_slli_si128(vec1, 8));
        _mm_store_si128((__m128i *)maxt + i, vec1);
        /* Byte 15 is the maximum so far; broadcast it as the next carry. */
        vec2 = _mm_set1_epi8((char)(_mm_extract_epi16(vec1, 7) >> 8));
    }

    /* Right-to-left pass: suffix max, combine with prefix max, and sum. */
    total = zero;
    vec2 = zero; /* running maximum of all following vectors, broadcast */
    for (i = nblocks; i-- > 0; ) {
        vec3 = _mm_load_si128((const __m128i *)towers8 + i);
        vec1 = _mm_max_epu8(vec3, vec2);
        /* In-register inclusive suffix max (mirror image of the above). */
        vec1 = _mm_max_epu8(vec1, _mm_srli_si128(vec1, 1));
        vec1 = _mm_max_epu8(vec1, _mm_srli_si128(vec1, 2));
        vec1 = _mm_max_epu8(vec1, _mm_srli_si128(vec1, 4));
        vec1 = _mm_max_epu8(vec1, _mm_srli_si128(vec1, 8));
        /* water = min(prefix max, suffix max) - height, per byte */
        vec3 = _mm_subs_epu8(
            _mm_min_epu8(_mm_load_si128((const __m128i *)maxt + i), vec1),
            vec3);
        /* SAD against zero adds the 16 bytes into two 64-bit lanes. */
        total = _mm_add_epi64(total, _mm_sad_epu8(vec3, zero));
        /* Byte 0 is the maximum from here to the end; broadcast it. */
        vec2 = _mm_set1_epi8((char)(_mm_cvtsi128_si32(vec1) & 0xff));
    }

    /* Fold the upper 64-bit lane into the lower one. */
    total = _mm_add_epi64(total, _mm_srli_si128(total, 8));

    /* Release only the scratch buffers; `towers` belongs to the caller. */
    free(towers8);
    free(maxt);
    return _mm_cvtsi128_si32(total);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment