Created
October 15, 2016 20:31
-
-
Save fatihky/8e650e9e116108a7154cccecb3439bdb to your computer and use it in GitHub Desktop.
Summing numbers with SSE ("sse ile sayi toplama") - sum an int array with SSE SIMD - SSE: 28 ms, standard: 97 ms
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <time.h> | |
| #include <assert.h> | |
| #include <sys/wait.h> | |
| #include <errno.h> | |
| #include <assert.h> | |
| #include <ctype.h> | |
| #include <stdarg.h> | |
| #include <arpa/inet.h> | |
| #include <sys/stat.h> | |
| #include <fcntl.h> | |
| #include <sys/time.h> | |
| #include <emmintrin.h> | |
| #include "fr.h" | |
/* Current time since the UNIX epoch, in microseconds, as reported by gettimeofday(). */
long long ustime(void) {
    struct timeval now;

    gettimeofday(&now, NULL);
    /* Widen the seconds to long long before scaling so the product is computed in long long. */
    return (long long)now.tv_sec * 1000000 + now.tv_usec;
}
/* Return the UNIX time in milliseconds (ustime() divided by 1000). */
#define mstime() (ustime() / 1000)

/* Start timestamps written by mst() / ust() and read back by mend() / uend(). */
static long long _m; /* milliseconds */
static long long _u; /* microseconds */

/* Measurement start: record the current time in _m (ms) or _u (us). */
#define mst() (_m = mstime())
#define ust() (_u = ustime())

/*
 * Measurement end: print the time elapsed since the matching mst() / ust().
 * `op` is a C string used as the label in the printed line.
 */
#define mend(op) printf("op %s took %lld ms\n", (op), mstime() - _m)
#define uend(op) printf("op %s took %lld us\n", (op), ustime() - _u)

/*
 * Counting-loop shorthands: run the following statement with i / j / k going
 * from 0 up to (but not including) cnt. The bound is parenthesized so that
 * expressions such as `a ? b : c` or `x | y` are compared as a whole instead
 * of being split by operator precedence.
 */
#define f(cnt) for (register int i = 0; i < (cnt); i++)
#define fj(cnt) for (register int j = 0; j < (cnt); j++)
#define fk(cnt) for (register int k = 0; k < (cnt); k++)
/*
 * Multiply `word` by `p` in 64-bit arithmetic and return the upper 32 bits
 * of the product (the product shifted right by 32), narrowed to int32_t.
 */
static inline int32_t fastrangei32(int32_t word, int32_t p) {
    const int64_t product = (int64_t)word * (int64_t)p;
    const int64_t upper = product >> 32;

    return (int32_t)upper;
}
/*
 * Benchmark driver: allocate a 16-byte-aligned array of 100,000,000 ints,
 * fill it with 1, sum it with a plain scalar loop, then print the elapsed
 * time in microseconds and the total.
 *
 * Returns 0 on success, or the posix_memalign() error code if the buffer
 * cannot be allocated. The command-line arguments are not used.
 *
 * The commented-out sections are earlier experiments (fastrange, two SSE
 * attempts, and the working SSE version) kept for comparison.
 */
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    /* Element count as an integer constant, so neither the allocation size
     * nor the loop bounds go through floating point. */
    enum { COUNT = 100000000 };

    /* posix_memalign() writes through a void **; use a real void * rather
     * than casting the address of an int *. */
    void *raw = NULL;
    int res = posix_memalign(&raw, 16, (size_t)COUNT * sizeof(int));
    if (res != 0) {
        fprintf(stderr, "posix_memalign failed: %d\n", res);
        return res;
    }
    int *nm = raw;

    for (int i = 0; i < COUNT; i++) {
        nm[i] = 1; //rand() % 2;
    }

    ust();

    // --- experiment: fastrange accumulation (needs a uint64_t tot = 0) ---
    // f(1e8) {
    //     tot += fastrangei32(nm[i], 5);
    // }

    // --- experiment: SSE, two aligned loads per iteration ---
    // int tmp[] = {0, 0, 0, 0};
    // __m128i sum = _mm_load_si128 ((__m128i *)&tmp);
    // for (register int i = 0; i < 1e8; i += 8) {
    //     __m128i v1 = _mm_load_si128 ((__m128i *)&nm[i]);
    //     __m128i v2 = _mm_load_si128 ((__m128i *)&nm[i + 4]);
    //     __m128i res = _mm_add_epi32 (v1, v2);
    //     sum = _mm_add_epi32 (sum, res);
    // }

    // --- experiment: SSE, four 32-bit partial sums + horizontal add ---
    // const __m128i vk0 = _mm_set1_epi8(0); // constant vector of all 0s for use with _mm_unpacklo_epi8/_mm_unpackhi_epi8
    // const __m128i vk1 = _mm_set1_epi16(1); // constant vector of all 1s for use with _mm_madd_epi16
    // __m128i vsum = _mm_set1_epi32(0); // initialise vector of four partial 32 bit sums
    // uint32_t sum;
    // int i;
    // for (i = 0; i < 1e8; i += 4)
    // {
    //     __m128i v = _mm_load_si128((__m128i *)&nm[i]); // load one 128-bit vector
    //     vsum = _mm_add_epi32 (v, vsum);
    //     // __m128i vl = _mm_unpacklo_epi8(v, vk0); // unpack to two vectors of 16 bit values
    //     // __m128i vh = _mm_unpackhi_epi8(v, vk0);
    //     // vsum = _mm_add_epi32(vsum, _mm_madd_epi16(vl, vk1));
    //     // vsum = _mm_add_epi32(vsum, _mm_madd_epi16(vh, vk1));
    //     // // unpack and accumulate 16 bit values to
    //     // // 32 bit partial sum vector
    // }
    // // horizontal add of four 32 bit partial sums and return result
    // vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
    // vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
    // //sum = _mm_cvtsi128_si32(vsum);

    //////// WORKING SSE SIMD CODE (begin)
    // int storeSIMDed[4];
    // int i;
    // int j = 1e8 / 4;
    // __m128i sum = _mm_setzero_si128();
    // __m128i operand;
    // for (i = 0; i < j; i++) {
    //     operand = _mm_loadu_si128((__m128i *) nm + i);
    //     sum = _mm_add_epi32(operand, sum);
    // }
    // _mm_storeu_si128((__m128i *) storeSIMDed, sum);

    int sumResult = 0;

    // for (i = 0; i < 4; i++) {
    //     sumResult += storeSIMDed[i];
    // }
    //////// WORKING SSE SIMD CODE (end)
    // for (i = j * 4; i < n; i++) {
    //     sumResult = sumResult + nm[i];
    // }

    /* Scalar reference implementation: the variant currently being timed. */
    for (int i = 0; i < COUNT; i++) {
        sumResult += nm[i];
    }

    // long long resx = _mm_cvtsi128_si64(vsum);

    /* The label text is kept as-is; the timed loop covers COUNT elements. */
    uend("iterating 1m");
    printf("tot %d\n", sumResult);

    free(nm);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment