Created
August 7, 2017 19:04
-
-
Save kuribas/ddd977968c3a56eac505901acb3e8eee to your computer and use it in GitHub Desktop.
watertower problem in haskell, C and SSE
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
benchmarking rainfall/list | |
time 2.697 s (2.532 s .. 3.065 s) | |
0.998 R² (NaN R² .. 1.000 R²) | |
mean 2.771 s (2.712 s .. 2.800 s) | |
std dev 51.39 ms (0.0 s .. 51.77 ms) | |
variance introduced by outliers: 19% (moderately inflated) | |
benchmarking rainfall/unboxed vector | |
time 51.66 ms (51.59 ms .. 51.83 ms) | |
1.000 R² (1.000 R² .. 1.000 R²) | |
mean 19.44 ms (12.80 ms .. 24.12 ms) | |
std dev 6.520 ms (2.357 ms .. 8.719 ms) | |
variance introduced by outliers: 86% (severely inflated) | |
benchmarking rainfall/C | |
time 27.10 ms (27.06 ms .. 27.15 ms) | |
1.000 R² (1.000 R² .. 1.000 R²) | |
mean 11.23 ms (8.686 ms .. 13.19 ms) | |
std dev 3.755 ms (2.703 ms .. 5.055 ms) | |
variance introduced by outliers: 88% (severely inflated) | |
benchmarking rainfall/C sse | |
time 12.15 ms (12.13 ms .. 12.18 ms) | |
1.000 R² (1.000 R² .. 1.000 R²) | |
mean 3.530 ms (2.794 ms .. 4.123 ms) | |
std dev 1.059 ms (786.0 μs .. 1.469 ms) | |
variance introduced by outliers: 92% (severely inflated) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "smmintrin.h" | |
#include "malloc.h" | |
#include "stdlib.h" | |
#include "stdio.h" | |
#define MAX(x,y) (x > y ? x : y) | |
#define MIN(x,y) (x > y ? y : x) | |
unsigned int tower(unsigned int length, unsigned int *towers) | |
{ | |
unsigned int *maxt = malloc(sizeof(unsigned int)*length); | |
int i, mx; | |
unsigned int sum; | |
mx = 0; | |
for(i = 0; i < length; i++) | |
{ | |
mx = MAX(mx, towers[i]); | |
maxt[i] = mx; | |
} | |
mx = 0; | |
sum = 0; | |
for(i = length-1; i>=0; i--) | |
{ | |
mx = MAX(mx, towers[i]); | |
sum += MIN(maxt[i], mx) - towers[i]; | |
} | |
free(maxt); | |
return sum; | |
} | |
unsigned int tower2(unsigned int length, unsigned int *towers) | |
{ | |
unsigned char *maxt, *towers8; | |
int i, length8; | |
__m128i vec1, vec2, vec3, sum; | |
length8 = ((length+15)/16); | |
towers8 = memalign(sizeof(__m128i), length8*16); | |
maxt = memalign(sizeof(__m128i), length8*16); | |
if(maxt == NULL || towers8 == NULL) | |
return 0; | |
/* copy from int32[] to uint8[] */ | |
for(i = 0; i < length8-2; i++) { | |
vec1 = ((__m128i*)towers)[i*4]; | |
vec2 = ((__m128i*)towers)[i*4+1]; | |
vec1 = _mm_packus_epi32(vec1, vec2); | |
vec2 = ((__m128i*)towers)[i*4+2]; | |
vec3 = ((__m128i*)towers)[i*4+3]; | |
vec2 = _mm_packus_epi32(vec2, vec3); | |
vec1 = _mm_packus_epi16(vec1, vec2); | |
((__m128i*)towers8)[i] = vec1; | |
} | |
/* copy from int32[] to uint8[] */ | |
for(i = i*16; i < length; i++) { | |
towers8[i] = (unsigned char)towers[i]; | |
} | |
/* pad with zeros */ | |
for(; i < length8*16; i++) | |
towers8[i] = 0; | |
vec2 = _mm_setzero_si128(); | |
for(i = 0; i < length8; i++) { | |
vec1 = _mm_load_si128(((__m128i*)towers8) + i); | |
vec1 = _mm_max_epu8(vec1, vec2); | |
vec2 = _mm_shuffle_epi8(vec1, _mm_setr_epi8(0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14)); | |
vec1 = _mm_max_epu8(vec1, vec2); | |
vec2 = _mm_shuffle_epi8(vec1 ,_mm_setr_epi8(0,1,1,1,4,5,5,5,8,9,9,9,12,13,13,13)); | |
vec1 = _mm_max_epu8(vec1, vec2); | |
vec2 = _mm_shuffle_epi8(vec1 ,_mm_setr_epi8(0,1,2,3,3,3,3,3,8,9,10,11,11,11,11,11)); | |
vec1 = _mm_max_epu8(vec1, vec2); | |
vec2 = _mm_shuffle_epi8(vec1 ,_mm_setr_epi8(0,1,2,3,4,5,6,7,7,7,7,7,7,7,7,7)); | |
vec1 = _mm_max_epu8(vec1, vec2); | |
((__m128i*)maxt)[i] = vec1; | |
vec2 = _mm_shuffle_epi8(vec1, _mm_setr_epi8(15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15)); | |
} | |
vec2 = _mm_setzero_si128(); | |
sum = _mm_setzero_si128(); | |
for(i = length8-1; i >= 0; i--) { | |
vec3 = _mm_load_si128(((__m128i*)towers8) + i); | |
vec1 = _mm_max_epu8(vec3, vec2); | |
vec2 = _mm_shuffle_epi8(vec1 ,_mm_setr_epi8(1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15)); | |
vec1 = _mm_max_epu8(vec1, vec2); | |
vec2 = _mm_shuffle_epi8(vec1 ,_mm_setr_epi8(2,2,2,3,6,6,6,7,10,10,10,11,14,14,14,15)); | |
vec1 = _mm_max_epu8(vec1, vec2); | |
vec2 = _mm_shuffle_epi8(vec1 ,_mm_setr_epi8(4,4,4,4,4,5,6,7,12,12,12,12,12,13,14,15)); | |
vec1 = _mm_max_epu8(vec1, vec2); | |
vec2 = _mm_shuffle_epi8(vec1 ,_mm_setr_epi8(8,8,8,8,8,8,8,8,8,9,10,11,12,13,14,15)); | |
vec1 = _mm_max_epu8(vec1, vec2); | |
vec2 = _mm_subs_epu8(_mm_min_epu8(((__m128i*)maxt)[i], vec1), vec3); | |
vec3 = _mm_cvtepu8_epi32(vec2); | |
sum = _mm_add_epi32(sum, vec3); | |
vec2 = _mm_shuffle_epi32(vec2, _MM_SHUFFLE(0, 3, 2, 1)); | |
vec3 = _mm_cvtepu8_epi32(vec2); | |
sum = _mm_add_epi32(sum, vec3); | |
vec2 = _mm_shuffle_epi32(vec2, _MM_SHUFFLE(0, 3, 2, 1)); | |
vec3 = _mm_cvtepu8_epi32(vec2); | |
sum = _mm_add_epi32(sum, vec3); | |
vec2 = _mm_shuffle_epi32(vec2, _MM_SHUFFLE(0, 3, 2, 1)); | |
vec3 = _mm_cvtepu8_epi32(vec2); | |
sum = _mm_add_epi32(sum, vec3); | |
vec2 = _mm_shuffle_epi8(vec1, _mm_setzero_si128()); | |
} | |
vec2 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)); | |
sum = _mm_add_epi32(vec2, sum); | |
vec2 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 3, 0, 1)); | |
sum = _mm_add_epi32(vec2, sum); | |
free(towers8); | |
free(maxt); | |
return _mm_cvtsi128_si32(sum); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Criterion.Main | |
import System.Random | |
import Data.List | |
import qualified Data.Vector.Unboxed as U | |
import qualified Data.Vector.Storable as S | |
import Foreign | |
import Foreign.C | |
import Foreign.C.Types | |
import System.IO.Unsafe | |
foreign import ccall "tower" tower :: CUInt -> Ptr CUInt -> IO CUInt | |
foreign import ccall "tower2" tower2 :: CUInt -> Ptr CUInt -> IO CUInt | |
rainfallC1 :: S.Vector CUInt -> CUInt | |
rainfallC1 l = | |
let (fptr, len) = S.unsafeToForeignPtr0 l | |
in unsafePerformIO $ | |
withForeignPtr fptr $ | |
tower (fromIntegral len) | |
rainfallC2 :: S.Vector CUInt -> CUInt | |
rainfallC2 l = | |
let (fptr, len) = S.unsafeToForeignPtr0 l | |
in unsafePerformIO $ | |
withForeignPtr fptr $ | |
tower2 (fromIntegral len) | |
rainfall :: U.Vector Int -> Int | |
rainfall xs = U.sum (U.zipWith (-) mins xs) | |
where | |
mins = U.zipWith min maxl maxr | |
maxl = U.scanl1' max xs | |
maxr = U.scanr1' max xs | |
rainfallL :: [Int] -> Int | |
rainfallL xs = sum (zipWith (-) mins xs) | |
where | |
mins = zipWith min maxl maxr | |
maxl = scanl1 max xs | |
maxr = scanr1 max xs | |
-- Our benchmark harness. | |
main = do | |
g <- getStdGen | |
let hs = take 10000000 (randomRs (0, 200) g :: [Int]) | |
hv = U.fromList hs | |
cv = S.fromList $ map fromIntegral hs | |
defaultMain [ | |
bgroup "rainfall" [ | |
bench "list" $ whnf rainfallL hs, | |
bench "unboxed vector" $ whnf rainfall hv, | |
bench "C" $ whnf rainfallC1 cv, | |
bench "C sse" $ whnf rainfallC2 cv | |
] | |
] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment