Skip to content

Instantly share code, notes, and snippets.

@fatihky
Created October 15, 2016 20:31
Show Gist options
  • Select an option

  • Save fatihky/8e650e9e116108a7154cccecb3439bdb to your computer and use it in GitHub Desktop.

Select an option

Save fatihky/8e650e9e116108a7154cccecb3439bdb to your computer and use it in GitHub Desktop.
/* Sum an int array with SSE SIMD ("sse ile sayi toplama") - sse: 28ms, standard: 97ms */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <assert.h>
#include <sys/wait.h>
#include <errno.h>
#include <assert.h>
#include <ctype.h>
#include <stdarg.h>
#include <arpa/inet.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/time.h>
#include <emmintrin.h>
#include "fr.h"
/*
 * Current UNIX time expressed in microseconds.
 *
 * Reads the clock with gettimeofday() and folds the seconds and
 * microseconds fields into a single long long value.
 */
long long ustime(void) {
    struct timeval now;

    gettimeofday(&now, NULL);
    return (long long)now.tv_sec * 1000000 + now.tv_usec;
}
/*
 * Return the UNIX time in milliseconds: ustime() divided by 1000.
 * Invoked as mstime(). The macro takes no parameters (the previous
 * definition declared a macro parameter that was literally named `void`).
 */
#define mstime() (ustime() / 1000)

/* Start timestamps written by mst() (milliseconds) and ust() (microseconds). */
static long long _m;
static long long _u;

/* Measure start: record the current time in _m / _u. */
#define mst() (_m = mstime())
#define ust() (_u = ustime())

/*
 * Measure end: print the time elapsed since the matching mst() / ust().
 * `op` is a C string naming the measured operation.
 */
#define mend(op) printf("op %s took %lld ms\n", (op), mstime() - _m)
#define uend(op) printf("op %s took %lld us\n", (op), ustime() - _u)

/*
 * Loop headers: execute the statement that follows with the counter
 * i (f), j (fj) or k (fk) running from 0 while it is less than cnt.
 * cnt is parenthesized so that an expression argument such as `a & b`
 * or `c ? x : y` is compared as a whole; it is re-evaluated on every
 * iteration.
 */
#define f(cnt) for (int i = 0; i < (cnt); i++)
#define fj(cnt) for (int j = 0; j < (cnt); j++)
#define fk(cnt) for (int k = 0; k < (cnt); k++)
/*
 * Multiply `word` by `p` in 64-bit arithmetic and return the upper
 * 32 bits of the product (the product shifted right by 32).
 *
 * NOTE(review): for a negative product the result of the right shift
 * is implementation-defined in C.
 */
static inline int32_t fastrangei32(int32_t word, int32_t p) {
    const int64_t product = (int64_t)word * (int64_t)p;

    return (int32_t)(product >> 32);
}
/* Number of ints in the benchmark array (previously the literal 1e8). */
enum { NUM_ELEMS = 100000000 };

#ifdef USE_SSE_SUM
/*
 * Sum `count` ints starting at `vals` using SSE2.
 *
 * Four 32-bit lanes are accumulated in parallel with _mm_add_epi32 over
 * unaligned 16-byte loads, the four partial sums are added together, and
 * any trailing elements (count % 4) are added one by one.
 * Selected at compile time by defining USE_SSE_SUM.
 */
static int sum_ints(const int *vals, size_t count) {
    const size_t vec_count = count / 4;
    __m128i acc = _mm_setzero_si128();

    for (size_t v = 0; v < vec_count; v++) {
        acc = _mm_add_epi32(acc, _mm_loadu_si128((const __m128i *)vals + v));
    }

    int lanes[4];
    _mm_storeu_si128((__m128i *)lanes, acc);

    int total = lanes[0] + lanes[1] + lanes[2] + lanes[3];
    /* Tail: elements that did not fill a whole 4-int vector. */
    for (size_t idx = vec_count * 4; idx < count; idx++) {
        total += vals[idx];
    }
    return total;
}
#else
/*
 * Sum `count` ints starting at `vals` with a plain scalar loop.
 * This is the default implementation (USE_SSE_SUM not defined).
 */
static int sum_ints(const int *vals, size_t count) {
    int total = 0;

    for (size_t idx = 0; idx < count; idx++) {
        total += vals[idx];
    }
    return total;
}
#endif

/*
 * Benchmark entry point.
 *
 * Allocates a 16-byte aligned array of NUM_ELEMS ints, fills it with 1,
 * times the summation of the array with ust()/uend() and prints the
 * elapsed microseconds followed by the total.
 *
 * Returns 0 on success, or the non-zero error code returned by
 * posix_memalign() when the allocation fails.
 */
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    const size_t count = NUM_ELEMS;

    /* posix_memalign() writes through a void **, so allocate into a
     * void * and convert afterwards instead of casting &nm. */
    void *raw = NULL;
    int res = posix_memalign(&raw, 16, count * sizeof(int));
    if (res != 0) {
        fprintf(stderr, "posix_memalign failed: error %d\n", res);
        return res;
    }
    int *nm = raw;

    for (size_t idx = 0; idx < count; idx++) {
        nm[idx] = 1;
    }

    /* Only the summation is inside the timed region. */
    ust();
    int sumResult = sum_ints(nm, count);
    /* The label text is kept unchanged even though the array holds
     * NUM_ELEMS (1e8) elements. */
    uend("iterating 1m");

    printf("tot %d\n", sumResult);

    free(nm);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment