Created
May 19, 2017 12:19
-
-
Save morris821028/17ab44dc4a3e4f2ac00b9d12525a66bc to your computer and use it in GitHub Desktop.
dynamic 2d range sum
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <inttypes.h> | |
#include <assert.h> | |
#include <x86intrin.h> | |
typedef struct Rect { | |
int32_t lx, ly, rx, ry; | |
} Rect; | |
static int32_t linear_search(Rect rect, int32_t x[], int32_t y[], int32_t w[], int32_t n) { | |
int32_t ret = 0; | |
for (int i = 0; i < n; i++) { | |
if (rect.lx <= x[i] && x[i] <= rect.rx && | |
rect.ly <= y[i] && y[i] <= rect.ry) { | |
ret += w[i]; | |
} | |
} | |
return ret; | |
} | |
static int32_t linear_search_lp(Rect rect, int32_t x[], int32_t y[], int32_t w[], int32_t n) { | |
int32_t ret = 0; | |
#define UNLOOP {if (rect.lx <= x[i] && x[i] <= rect.rx && rect.ly <= y[i] && y[i] <= rect.ry) { ret += w[i]; } i++;} | |
#define UNLOOP4 {UNLOOP UNLOOP UNLOOP UNLOOP} | |
#define UNLOOP8 {UNLOOP4 UNLOOP4} | |
int i; | |
for (; i+8 <= n; ) | |
UNLOOP8; | |
for (; i < n;) | |
UNLOOP; | |
return ret; | |
} | |
void printall(__m128i t) { | |
static int32_t tmp[4] __attribute__ ((aligned (16))); | |
_mm_store_si128((__m128i*) &tmp[0], t); | |
printf("printall %d %d %d %d\n", tmp[0], tmp[1], tmp[2], tmp[3]); | |
} | |
static int32_t linear_search_opt(Rect rect, int32_t x[], int32_t y[], int32_t w[], int32_t n) { | |
__m128i ret = _mm_set_epi32(0, 0, 0, 0); | |
rect.lx--, rect.ly--; | |
rect.rx++, rect.ry++; | |
__m128i lx = _mm_broadcastd_epi32(*((__m128i *) &rect.lx)); | |
__m128i ly = _mm_broadcastd_epi32(*((__m128i *) &rect.ly)); | |
__m128i rx = _mm_broadcastd_epi32(*((__m128i *) &rect.rx)); | |
__m128i ry = _mm_broadcastd_epi32(*((__m128i *) &rect.ry)); | |
__m128i zo = _mm_set_epi32(0, 0, 0, 0); | |
__m128i ic = _mm_set_epi32(3, 2, 1, 0); | |
/* | |
puts("===================================="); | |
{ | |
static int32_t tmp[4] __attribute__ ((aligned (16))); | |
_mm_store_si128((__m128i*) &tmp[0], lx); | |
printf("cc %d\n", rect.lx); | |
for (int i = 0; i < 4; i++) | |
printf("%d %d\n", i, tmp[i]); | |
} | |
*/ | |
for (int i = 0; i+4 <= n; i += 4) { | |
__m128i sx = _mm_load_si128((__m128i *) (x+i)); | |
__m128i sy = _mm_load_si128((__m128i *) (y+i)); | |
__m128i c1 = _mm_and_si128(_mm_cmplt_epi32(lx, sx), _mm_cmplt_epi32(sx, rx)); | |
__m128i c2 = _mm_and_si128(_mm_cmplt_epi32(ly, sy), _mm_cmplt_epi32(sy, ry)); | |
if (_mm_testz_si128(c1, c2) == 0) { | |
__m128i cc = _mm_and_si128(c1, c2); | |
__m128i vi = _mm_add_epi32(ic, _mm_set_epi32(i, i, i, i)); | |
__m128i rs = _mm_mask_i32gather_epi32(zo, w+i, ic, cc, 4); | |
//printf("found some one [%d %d] %d %d %d %d\n", i, i+3, w[i], w[i+1], w[i+2], w[i+3]); | |
//printall(rs); | |
//printall(zo); | |
ret = _mm_add_epi32(ret, rs); | |
} | |
} | |
int32_t sum = 0; | |
for (int i = (n>>2)<<2; i < n; i++) { | |
if (rect.lx <= x[i] && x[i] <= rect.rx && | |
rect.ly <= y[i] && y[i] <= rect.ry) { | |
sum += w[i]; | |
} | |
} | |
static int32_t tmp[4] __attribute__ ((aligned (16))); | |
_mm_store_si128((__m128i*) &tmp[0], ret); | |
sum += tmp[0] + tmp[1] + tmp[2] + tmp[3]; | |
return sum; | |
} | |
static void swap(int *x, int *y) { | |
int tmp = *x; | |
*x = *y; | |
*y = tmp; | |
} | |
static Rect rand_rect() { | |
Rect r; | |
r.lx = rand()%1000; | |
r.ly = rand()%1000; | |
r.rx = rand()%1000; | |
r.ry = rand()%1000; | |
if (r.lx > r.rx) swap(&r.lx, &r.rx); | |
if (r.ly > r.ry) swap(&r.ly, &r.ry); | |
assert(r.lx <= r.rx); | |
assert(r.ly <= r.ry); | |
return r; | |
} | |
int main() { | |
srand(0); | |
#define MAXN 1232 | |
static int32_t x[MAXN], y[MAXN], w[MAXN]; | |
int n = MAXN; | |
for (int i = 0; i < n; i++) { | |
x[i] = rand()%1000; | |
y[i] = rand()%1000; | |
w[i] = rand()%1000; | |
} | |
int32_t hash = 0; | |
for (int it = 0; it < 10000; it++) { | |
Rect rect = rand_rect(); | |
// int32_t ret = linear_search(rect, x, y, w, n); | |
int32_t ret = linear_search_lp(rect, x, y, w, n); | |
// int32_t ret = linear_search_opt(rect, x, y, w, n); | |
hash ^= ret; | |
} | |
printf("%" PRIi32 "\n", hash); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment