Skip to content

Instantly share code, notes, and snippets.

@morris821028
Created May 19, 2017 12:19
Show Gist options
  • Save morris821028/17ab44dc4a3e4f2ac00b9d12525a66bc to your computer and use it in GitHub Desktop.
Save morris821028/17ab44dc4a3e4f2ac00b9d12525a66bc to your computer and use it in GitHub Desktop.
dynamic 2d range sum
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <assert.h>
#include <x86intrin.h>
typedef struct Rect {
int32_t lx, ly, rx, ry;
} Rect;
static int32_t linear_search(Rect rect, int32_t x[], int32_t y[], int32_t w[], int32_t n) {
int32_t ret = 0;
for (int i = 0; i < n; i++) {
if (rect.lx <= x[i] && x[i] <= rect.rx &&
rect.ly <= y[i] && y[i] <= rect.ry) {
ret += w[i];
}
}
return ret;
}
static int32_t linear_search_lp(Rect rect, int32_t x[], int32_t y[], int32_t w[], int32_t n) {
int32_t ret = 0;
#define UNLOOP {if (rect.lx <= x[i] && x[i] <= rect.rx && rect.ly <= y[i] && y[i] <= rect.ry) { ret += w[i]; } i++;}
#define UNLOOP4 {UNLOOP UNLOOP UNLOOP UNLOOP}
#define UNLOOP8 {UNLOOP4 UNLOOP4}
int i;
for (; i+8 <= n; )
UNLOOP8;
for (; i < n;)
UNLOOP;
return ret;
}
void printall(__m128i t) {
static int32_t tmp[4] __attribute__ ((aligned (16)));
_mm_store_si128((__m128i*) &tmp[0], t);
printf("printall %d %d %d %d\n", tmp[0], tmp[1], tmp[2], tmp[3]);
}
static int32_t linear_search_opt(Rect rect, int32_t x[], int32_t y[], int32_t w[], int32_t n) {
__m128i ret = _mm_set_epi32(0, 0, 0, 0);
rect.lx--, rect.ly--;
rect.rx++, rect.ry++;
__m128i lx = _mm_broadcastd_epi32(*((__m128i *) &rect.lx));
__m128i ly = _mm_broadcastd_epi32(*((__m128i *) &rect.ly));
__m128i rx = _mm_broadcastd_epi32(*((__m128i *) &rect.rx));
__m128i ry = _mm_broadcastd_epi32(*((__m128i *) &rect.ry));
__m128i zo = _mm_set_epi32(0, 0, 0, 0);
__m128i ic = _mm_set_epi32(3, 2, 1, 0);
/*
puts("====================================");
{
static int32_t tmp[4] __attribute__ ((aligned (16)));
_mm_store_si128((__m128i*) &tmp[0], lx);
printf("cc %d\n", rect.lx);
for (int i = 0; i < 4; i++)
printf("%d %d\n", i, tmp[i]);
}
*/
for (int i = 0; i+4 <= n; i += 4) {
__m128i sx = _mm_load_si128((__m128i *) (x+i));
__m128i sy = _mm_load_si128((__m128i *) (y+i));
__m128i c1 = _mm_and_si128(_mm_cmplt_epi32(lx, sx), _mm_cmplt_epi32(sx, rx));
__m128i c2 = _mm_and_si128(_mm_cmplt_epi32(ly, sy), _mm_cmplt_epi32(sy, ry));
if (_mm_testz_si128(c1, c2) == 0) {
__m128i cc = _mm_and_si128(c1, c2);
__m128i vi = _mm_add_epi32(ic, _mm_set_epi32(i, i, i, i));
__m128i rs = _mm_mask_i32gather_epi32(zo, w+i, ic, cc, 4);
//printf("found some one [%d %d] %d %d %d %d\n", i, i+3, w[i], w[i+1], w[i+2], w[i+3]);
//printall(rs);
//printall(zo);
ret = _mm_add_epi32(ret, rs);
}
}
int32_t sum = 0;
for (int i = (n>>2)<<2; i < n; i++) {
if (rect.lx <= x[i] && x[i] <= rect.rx &&
rect.ly <= y[i] && y[i] <= rect.ry) {
sum += w[i];
}
}
static int32_t tmp[4] __attribute__ ((aligned (16)));
_mm_store_si128((__m128i*) &tmp[0], ret);
sum += tmp[0] + tmp[1] + tmp[2] + tmp[3];
return sum;
}
static void swap(int *x, int *y) {
int tmp = *x;
*x = *y;
*y = tmp;
}
static Rect rand_rect() {
Rect r;
r.lx = rand()%1000;
r.ly = rand()%1000;
r.rx = rand()%1000;
r.ry = rand()%1000;
if (r.lx > r.rx) swap(&r.lx, &r.rx);
if (r.ly > r.ry) swap(&r.ly, &r.ry);
assert(r.lx <= r.rx);
assert(r.ly <= r.ry);
return r;
}
int main() {
srand(0);
#define MAXN 1232
static int32_t x[MAXN], y[MAXN], w[MAXN];
int n = MAXN;
for (int i = 0; i < n; i++) {
x[i] = rand()%1000;
y[i] = rand()%1000;
w[i] = rand()%1000;
}
int32_t hash = 0;
for (int it = 0; it < 10000; it++) {
Rect rect = rand_rect();
// int32_t ret = linear_search(rect, x, y, w, n);
int32_t ret = linear_search_lp(rect, x, y, w, n);
// int32_t ret = linear_search_opt(rect, x, y, w, n);
hash ^= ret;
}
printf("%" PRIi32 "\n", hash);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment