; NOTE: all just a quick sketch
; also I'm using xmm/ymm/zmm0-5 and then 16-31
; because Win64 ABI expects me to preserve ymm6-ymm15
; and I couldn't be bothered to set up a stack frame and save them :)
;
; Meant to be assembled with NASM

section .data

vone    dd 1
vtwo    dd 2
v31     dd 31
vneg1   dd -1
vmaskb  dd 255
vbase   dd 0*256,1*256,2*256,3*256,4*256,5*256,6*256,7*256

section .text
; ---- Win64 ABI: arguments in rcx,rdx,r8

global histo_asm_scalar4_core
histo_asm_scalar4_core:
    mov r9, r8 ; original count
    shr r8, 2 ; trip count
    jz .tail
    push rbx

    ; Scalar (4x)
    ; This is super-simple: do a single 32b (4B) scalar load
    ; and keep extracting bytes, shifting, and incrementing the
    ; respective histogram bin.
    ;
    ; Keep 4 separate histograms: each 256-entry histogram
    ; is 1kB, so this is 4kB total; memory disambiguation for
    ; store->load forwarding only checks the bottom 12 bits
    ; of addresses on older Intel uArchs, so this is the largest
    ; number of separate histograms we can keep that will not
    ; result in false dependencies on these architectures.
    ; In any case, 4 histograms get essentially all of the
    ; benefit of having more histograms.
    ;
    ; I tried using "movzx eax, bh" and only shifting once
    ; for every pair of bytes, but that was slightly more
    ; expensive on my SKX workstation. Similar with using
    ; a "movzx eax, bl" / "movzx eax, bh" / "bswap ebx" /
    ; "movzx eax, bh" / "movzx eax, bl" combo.
    ;
    ; Haven't investigated this further.
.inner:
    mov ebx, [rdx]
    add rdx, 4
    movzx eax, bl
    shr ebx, 8
    add dword [rcx + rax*4 + 0*1024], 1
    movzx eax, bl
    shr ebx, 8
    add dword [rcx + rax*4 + 1*1024], 1
    movzx eax, bl
    add dword [rcx + rax*4 + 2*1024], 1
    shr ebx, 8
    add dword [rcx + rbx*4 + 3*1024], 1
    dec r8
    jnz .inner
    pop rbx

.tail:
    and r9, 3 ; masked count
    jz .done

.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp

.done:
    ret
; ---- Win64 ABI: arguments in rcx,rdx,r8

global histo_asm_scalar8_core
histo_asm_scalar8_core:
    mov r9, r8 ; original count
    shr r8, 3 ; trip count
    jz .tail
    push rbx

    ; Scalar (8x)
    ; Using a 64b (8B) load this time, but still 4 histogram
    ; slices.
.inner:
    mov rbx, [rdx]
    add rdx, 8
    movzx eax, bl
    shr rbx, 8
    add dword [rcx + rax*4 + 0*1024], 1
    movzx eax, bl
    shr rbx, 8
    add dword [rcx + rax*4 + 1*1024], 1
    movzx eax, bl
    shr rbx, 8
    add dword [rcx + rax*4 + 2*1024], 1
    movzx eax, bl
    shr rbx, 8
    add dword [rcx + rax*4 + 3*1024], 1
    movzx eax, bl
    shr ebx, 8
    add dword [rcx + rax*4 + 0*1024], 1
    movzx eax, bl
    shr ebx, 8
    add dword [rcx + rax*4 + 1*1024], 1
    movzx eax, bl
    add dword [rcx + rax*4 + 2*1024], 1
    shr ebx, 8
    add dword [rcx + rbx*4 + 3*1024], 1
    dec r8
    jnz .inner
    pop rbx

.tail:
    and r9, 7 ; masked count
    jz .done

.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp

.done:
    ret
; ----

global histo_asm_sse4_core
histo_asm_sse4_core:
    mov r9, r8 ; original count
    shr r8, 4 ; trip count
    jz .tail

    ; SSE4
    ; Same idea as before, still 4 histogram slices,
    ; but use a 16-byte SSE load and PEXTRB.
    ;
    ; This should not make much of a difference to the
    ; above (PEXTRB is 2 uops on the Intel uArchs I looked at,
    ; same as the movzx/shift pairs above), and indeed, in my
    ; tests anyway, it doesn't.
.inner:
    movdqu xmm0, [rdx]
    add rdx, 16
%assign i 0
%rep 16
    pextrb eax, xmm0, i
    add dword [rcx + rax*4 + (i&3)*256*4], 1
%assign i i+1
%endrep
    dec r8
    jnz .inner

.tail:
    and r9, 15 ; masked count
    jz .done

.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp

.done:
    ret
; ----

global histo_asm_avx256_8x_core1
histo_asm_avx256_8x_core1:
    vzeroupper
    mov r9, r8 ; original count
    shr r8, 5 ; trip count
    jz .tail
    vpbroadcastd ymm3, [rel vone]
    vmovdqu ymm4, [rel vbase]
    vpbroadcastd ymm5, [rel vmaskb]

    ; "AVX-256", variant 1.
    ;
    ; This is AVX-512, but "only" using 256-bit wide vectors.
    ; Since we're not using "heavy" instructions, 256-bit ops stay
    ; in the highest frequency license level on SKX.
    ; (The reason to use AVX-512 at all is that we need both
    ; gathers and scatters.)
    ;
    ; This avoids collisions between the 8 lanes by using
    ; 8 separate histograms. This would be bad on older
    ; uArchs with 4k store->load aliasing, but those
    ; certainly don't support AVX-512, so it's not a
    ; concern.
.inner:
    ; We treat these 32 bytes as 8 DWords
    vmovdqu ymm0, [rdx]
    add rdx, 32

    ; Extract bottom byte of every lane
    vpandd ymm1, ymm0, ymm5
    ; Add base so the lanes all go into different histograms
    ; (so they never conflict)
    vpaddd ymm1, ymm1, ymm4
    ; Dependency breaker
    vpxord ymm2, ymm2, ymm2
    ; Gather the histo bin values
    kxnorb k1, k1, k1
    vpgatherdd ymm2{k1}, [rcx + ymm1*4]
    ; Increment
    vpaddd ymm2, ymm2, ymm3
    ; Scatter updated values
    kxnorb k1, k1, k1
    vpscatterdd [rcx + ymm1*4]{k1}, ymm2

    ; Bits [15:8] of every lane
    vpsrld ymm1, ymm0, 8
    vpandd ymm1, ymm1, ymm5
    vpaddd ymm1, ymm1, ymm4
    vpxord ymm2, ymm2, ymm2
    kxnorb k1, k1, k1
    vpgatherdd ymm2{k1}, [rcx + ymm1*4]
    vpaddd ymm2, ymm2, ymm3
    kxnorb k1, k1, k1
    vpscatterdd [rcx + ymm1*4]{k1}, ymm2

    ; Bits [23:16] of every lane
    vpsrld ymm1, ymm0, 16
    vpandd ymm1, ymm1, ymm5
    vpaddd ymm1, ymm1, ymm4
    vpxord ymm2, ymm2, ymm2
    kxnorb k1, k1, k1
    vpgatherdd ymm2{k1}, [rcx + ymm1*4]
    vpaddd ymm2, ymm2, ymm3
    kxnorb k1, k1, k1
    vpscatterdd [rcx + ymm1*4]{k1}, ymm2

    ; Bits [31:24] of every lane
    vpsrld ymm1, ymm0, 24
    vpaddd ymm1, ymm1, ymm4
    vpxord ymm2, ymm2, ymm2
    kxnorb k1, k1, k1
    vpgatherdd ymm2{k1}, [rcx + ymm1*4]
    vpaddd ymm2, ymm2, ymm3
    kxnorb k1, k1, k1
    vpscatterdd [rcx + ymm1*4]{k1}, ymm2

    dec r8
    jnz .inner

.tail:
    and r9, 31 ; masked count
    jz .done

.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp

.done:
    vzeroupper
    ret
; ----

global histo_asm_avx256_8x_core2
histo_asm_avx256_8x_core2:
    vzeroupper
    mov r9, r8 ; original count
    shr r8, 3 ; trip count
    jz .tail
    vpbroadcastd ymm3, [rel vone]
    vmovdqu ymm4, [rel vbase]

    ; "AVX-256", variant 2
    ;
    ; Only do a single gather/scatter per iteration, and
    ; use vpmovzxbd to upconvert the 8 bytes.
    ;
    ; This is definitely worse than variant 1.
.inner:
    vpmovzxbd ymm0, [rdx]
    add rdx, 8
    kxnorb k1, k1, k1
    vpaddd ymm0, ymm0, ymm4
    vpxord ymm2, ymm2, ymm2
    vpgatherdd ymm2{k1}, [rcx + ymm0*4]
    vpaddd ymm2, ymm2, ymm3
    kxnorb k1, k1, k1
    vpscatterdd [rcx + ymm0*4]{k1}, ymm2
    dec r8
    jnz .inner

.tail:
    and r9, 7 ; masked count
    jz .done

.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp

.done:
    vzeroupper
    ret
; ----

global histo_asm_avx256_8x_core3
histo_asm_avx256_8x_core3:
    vzeroupper
    mov r9, r8 ; original count
    shr r8, 5 ; trip count
    jz .tail
    vpbroadcastd ymm3, [rel vone]
    vpbroadcastd ymm17, [rel vtwo]
    vmovdqu ymm4, [rel vbase]
    vpbroadcastd ymm5, [rel vmaskb]

    ; "AVX-256", variant 3
    ;
    ; This is similar to variant 1, but manually checks
    ; whether adjacent batches of 8 collide lane-by-lane;
    ; if so, the first "hit" in the batch updates the
    ; histogram count by 2, and the second is disabled.
    ; The idea here is to avoid store->load forwarding
    ; stalls by hand.
    ;
    ; This is sometimes marginally better than
    ; variant 1.
.inner:
    vmovdqu ymm0, [rdx]
    add rdx, 32

    vpandd ymm1, ymm0, ymm5
    vpsrld ymm16, ymm0, 8
    vpandd ymm16, ymm16, ymm5
    vpcmpd k2, ymm1, ymm16, 0 ; second iter matches first iter?
    vpblendmd ymm18{k2}, ymm3, ymm17 ; second_matches ? 2 : 1
    vpaddd ymm1, ymm1, ymm4
    vpxord ymm2, ymm2, ymm2
    kxnorb k1, k1, k1
    vpgatherdd ymm2{k1}, [rcx + ymm1*4]
    vpaddd ymm2, ymm2, ymm18
    kxnorb k1, k1, k1
    vpscatterdd [rcx + ymm1*4]{k1}, ymm2
    vpaddd ymm16, ymm16, ymm4
    vpxord ymm18, ymm18, ymm18
    knotb k1, k2
    vpgatherdd ymm18{k1}, [rcx + ymm16*4]
    vpaddd ymm18, ymm18, ymm3
    knotb k1, k2
    vpscatterdd [rcx + ymm16*4]{k1}, ymm18

    vpsrld ymm1, ymm0, 16
    vpandd ymm1, ymm1, ymm5
    vpsrld ymm16, ymm0, 24
    vpcmpd k2, ymm1, ymm16, 0 ; second iter matches first iter?
    vpblendmd ymm18{k2}, ymm3, ymm17 ; second_matches ? 2 : 1
    vpaddd ymm1, ymm1, ymm4
    vpxord ymm2, ymm2, ymm2
    kxnorb k1, k1, k1
    vpgatherdd ymm2{k1}, [rcx + ymm1*4]
    vpaddd ymm2, ymm2, ymm18
    kxnorb k1, k1, k1
    vpscatterdd [rcx + ymm1*4]{k1}, ymm2
    vpaddd ymm16, ymm16, ymm4
    vpxord ymm2, ymm2, ymm2
    knotb k1, k2
    vpgatherdd ymm2{k1}, [rcx + ymm16*4]
    vpaddd ymm2, ymm2, ymm3
    knotb k1, k2
    vpscatterdd [rcx + ymm16*4]{k1}, ymm2

    dec r8
    jnz .inner

.tail:
    and r9, 31 ; masked count
    jz .done

.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp

.done:
    vzeroupper
    ret
; ----

global histo_asm_avx512_core_conflict
histo_asm_avx512_core_conflict:
    vzeroupper
    mov r9, r8 ; original count
    shr r8, 4 ; trip count
    jz .tail
    vpbroadcastd zmm4, [rel vone]
    vpbroadcastd zmm5, [rel v31]
    vpbroadcastd zmm17, [rel vneg1]

    ; AVX-512, conflict detect
    ;
    ; This is the algorithm from the Intel Optimization Manual (example 15-18)
    ; with typos fixed.
    ;
    ; This one only writes to a single histogram slice, which, given how much
    ; extra work that implies, is probably not a good idea.
.inner:
    vpmovzxbd zmm0, [rdx]
    add rdx, 16
    vpconflictd zmm1, zmm0 ; zmm1=conflicts: per lane, bitmask of earlier lanes with the same value
    vmovaps zmm3, zmm4 ; per-lane increment, starts at vOne
    vpxord zmm2, zmm2, zmm2
    kxnorw k1, k1, k1
    vpgatherdd zmm2{k1}, [rcx + zmm0*4]
    vptestmd k1, zmm1, zmm1 ; k1 = lanes that conflict with an earlier lane
    kortestw k1, k1
    jz .update
    vplzcntd zmm1, zmm1
    vpsubd zmm1, zmm5, zmm1 ; 31-lzcnt = index of closest earlier lane with the same value (-1 if none)
.conflicts:
    vpermd zmm16{k1}{z}, zmm1, zmm3 ; fetch that lane's current increment
    vpermd zmm1{k1}, zmm1, zmm1 ; follow the conflict chain one step
    vpaddd zmm3{k1}, zmm3, zmm16 ; accumulate increments
    vpcmpd k1, zmm1, zmm17, 4 ; keep going while any chain hasn't hit -1 yet
    kortestw k1, k1
    jnz .conflicts
.update:
    vpaddd zmm2, zmm2, zmm3
    kxnorw k1, k1, k1
    vpscatterdd [rcx + zmm0*4]{k1}, zmm2 ; on duplicate indices, the highest lane (with the full count) wins
    dec r8
    jnz .inner

.tail:
    and r9, 15 ; masked count
    jz .done

.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp

.done:
    vzeroupper
    ret
; ----

global histo_asm_avx512_core_8x
histo_asm_avx512_core_8x:
    vzeroupper
    mov r9, r8 ; original count
    shr r8, 4 ; trip count
    jz .tail
    mov eax, 0xff
    kmovw k3, eax ; "low half of lanes" mask
    vbroadcasti32x8 zmm4, [rel vbase]
    vpbroadcastd zmm5, [rel vone]
    vpaddd zmm16, zmm5, zmm5 ; two

    ; AVX-512, 8 histogram slices
    ;
    ; This means we can do the conflict detection manually like in var3 above.
    ; That seems preferable overall, and is indeed significantly faster in
    ; my tests.
.inner:
    vpmovzxbd zmm0, [rdx]
    add rdx, 16
    vpaddd zmm1, zmm0, zmm4 ; add base lanes
    valignd zmm2, zmm0, zmm0, 8 ; input bytes rotated by 8 lanes
    vpcmpd k2{k3}, zmm0, zmm2, 0 ; check whether high half matches low half
    ; grab source bin values
    vpxord zmm2, zmm2, zmm2
    kxnorw k1, k1, k1
    vpgatherdd zmm2{k1}, [rcx + zmm1*4]
    ; depending on whether we have a conflict between matching high and low lanes,
    ; add either 1 or 2
    vpblendmd zmm0{k2}, zmm5, zmm16
    vpaddd zmm2, zmm2, zmm0
    ; determine output scatter mask
    kshiftlw k1, k2, 8
    knotw k1, k1
    vpscatterdd [rcx + zmm1*4]{k1}, zmm2
    dec r8
    jnz .inner

.tail:
    and r9, 15 ; masked count
    jz .done

.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp

.done:
    vzeroupper
    ret
// This started from something more cross-platform but in its
// present version is only meant to compile with MSVC++
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <emmintrin.h>

typedef uint8_t U8;
typedef uint32_t U32;
typedef uint64_t U64;
static U8 *read_file(const char *filename, size_t *out_size)
{
    U8 *ret = NULL;
    FILE *f = fopen(filename, "rb");
    if (f)
    {
        fseek(f, 0, SEEK_END);
        long size = ftell(f);
        fseek(f, 0, SEEK_SET);

        U8 *mem = new U8[size];
        if (mem)
        {
            if (fread(mem, size, 1, f) == 1)
            {
                if (out_size)
                    *out_size = size;
                ret = mem;
            }
            else
                delete[] mem;
        }

        fclose(f);
    }

    return ret;
}
static inline U64 cycle_timer()
{
#ifdef _MSC_VER
    return __rdtsc();
#else
    U32 lo, hi;
    __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi) );
    return lo | ((U64)hi << 32);
#endif
}

static inline uint32_t read32(const void *p)
{
    uint32_t x;
    memcpy(&x, p, sizeof(x));
    return x;
}
static void histo_ref(U32 * counts, const U8 * rawArray, size_t rawLen)
{
    memset(counts,0,256*sizeof(U32));
    for (size_t i = 0; i < rawLen; i++)
        counts[rawArray[i]]++;
}

static void histo_cpp_1x(U32 * counts, const U8 * rawArray, size_t rawLen)
{
    memset(counts,0,256*sizeof(U32));

    const U8 * rawPtr = rawArray;
    const U8 * rawEnd = rawArray+rawLen;
    const U8 * rawEndMul4 = rawArray+(rawLen&~3);

    while(rawPtr < rawEndMul4)
    {
        U32 x = read32(rawPtr);
        counts[x & 0xff]++; x >>= 8;
        counts[x & 0xff]++; x >>= 8;
        counts[x & 0xff]++; x >>= 8;
        counts[x] ++; // last one doesn't need to mask
        rawPtr += 4;
    }

    // finish the last few bytes
    while(rawPtr < rawEnd)
        counts[ *rawPtr++ ] ++;
}
static void histo_cpp_2x(U32 * counts, const U8 * rawArray, size_t rawLen)
{
    U32 __declspec(align(16)) countsArray[2][256];
    memset(countsArray,0,sizeof(countsArray));

    const U8 * rawPtr = rawArray;
    const U8 * rawEnd = rawArray+rawLen;
    const U8 * rawEndMul4 = rawArray+(rawLen&~3);

    while(rawPtr < rawEndMul4)
    {
        U32 x = read32(rawPtr);
        countsArray[0][x & 0xff]++; x >>= 8;
        countsArray[1][x & 0xff]++; x >>= 8;
        countsArray[0][x & 0xff]++; x >>= 8;
        countsArray[1][x] ++; // last one doesn't need to mask
        rawPtr += 4;
    }

    // finish the last few bytes (just throw them into array 0, doesn't matter)
    while(rawPtr < rawEnd)
        countsArray[0][ *rawPtr++ ] ++;

    // sum the two countsArray slices together
    for (int k=0; k<256; k+=4)
    {
        __m128i sum = _mm_load_si128((const __m128i *) &countsArray[0][k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &countsArray[1][k]);
        _mm_storeu_si128((__m128i *)&counts[k], sum);
    }
}
static void finish_histo_4x(U32 * counts_out, const U32 * counts4x)
{
    for (size_t k=0; k<256; k+=4)
    {
        __m128i sum = _mm_load_si128((const __m128i *) &counts4x[0*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts4x[1*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts4x[2*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts4x[3*256 + k]);
        _mm_storeu_si128((__m128i *)&counts_out[k], sum);
    }
}

static void finish_histo_8x(U32 * counts_out, const U32 * counts8x)
{
    for (size_t k=0; k<256; k+=4)
    {
        __m128i sum = _mm_load_si128((const __m128i *) &counts8x[0*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[1*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[2*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[3*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[4*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[5*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[6*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[7*256 + k]);
        _mm_storeu_si128((__m128i *)&counts_out[k], sum);
    }
}
static void histo_cpp_4x(U32 * counts, const U8 * rawArray, size_t rawLen)
{
    U32 __declspec(align(16)) countsArray[4*256];
    memset(countsArray,0,sizeof(countsArray));

    const U8 * rawPtr = rawArray;
    const U8 * rawEnd = rawArray+rawLen;
    const U8 * rawEndMul4 = rawArray+(rawLen&~3);

    while(rawPtr < rawEndMul4)
    {
        U32 x = read32(rawPtr);
        countsArray[0*256 + (x & 0xff)]++; x >>= 8;
        countsArray[1*256 + (x & 0xff)]++; x >>= 8;
        countsArray[2*256 + (x & 0xff)]++; x >>= 8;
        countsArray[3*256 + x] ++; // last one doesn't need to mask
        rawPtr += 4;
    }

    // finish the last few bytes (just throw them into slice 0, doesn't matter)
    while(rawPtr < rawEnd)
        countsArray[ *rawPtr++ ] ++;

    finish_histo_4x(counts, countsArray);
}
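// A minimal sketch (not part of the original benchmark) of what an 8-bytes-per-load
// C++ variant could look like, mirroring histo_asm_scalar8_core: one 64-bit load,
// still 4 histogram slices. The names read64 and histo_cpp_8x_sketch are made up
// for illustration.
static inline U64 read64(const void *p)
{
    U64 x;
    memcpy(&x, p, sizeof(x));
    return x;
}

static void histo_cpp_8x_sketch(U32 * counts, const U8 * rawArray, size_t rawLen)
{
    U32 __declspec(align(16)) countsArray[4*256];
    memset(countsArray,0,sizeof(countsArray));

    const U8 * rawPtr = rawArray;
    const U8 * rawEnd = rawArray+rawLen;
    const U8 * rawEndMul8 = rawArray+(rawLen&~7);

    while(rawPtr < rawEndMul8)
    {
        U64 x = read64(rawPtr);
        countsArray[0*256 + (x & 0xff)]++; x >>= 8;
        countsArray[1*256 + (x & 0xff)]++; x >>= 8;
        countsArray[2*256 + (x & 0xff)]++; x >>= 8;
        countsArray[3*256 + (x & 0xff)]++; x >>= 8;
        countsArray[0*256 + (x & 0xff)]++; x >>= 8;
        countsArray[1*256 + (x & 0xff)]++; x >>= 8;
        countsArray[2*256 + (x & 0xff)]++; x >>= 8;
        countsArray[3*256 + x] ++; // top byte doesn't need masking
        rawPtr += 8;
    }

    // finish the last few bytes (just throw them into slice 0, doesn't matter)
    while(rawPtr < rawEnd)
        countsArray[ *rawPtr++ ] ++;

    finish_histo_4x(counts, countsArray);
}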
extern "C" void histo_asm_scalar4_core(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_scalar8_core(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_sse4_core(U32 * histo, const U8 * bytes, size_t nbytes); | |
static void histo_asm_scalar4(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[4*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_scalar4_core(countsArray, rawArray, rawLen); | |
finish_histo_4x(counts, countsArray); | |
} | |
static void histo_asm_scalar8(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[4*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_scalar8_core(countsArray, rawArray, rawLen); | |
finish_histo_4x(counts, countsArray); | |
} | |
static void histo_asm_sse4(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[4*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_sse4_core(countsArray, rawArray, rawLen); | |
finish_histo_4x(counts, countsArray); | |
} | |
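// Intrinsics sketch (assumed, not what the asm core compiles from) of the inner loop that
// histo_asm_sse4_core implements: one 16-byte load, then one PEXTRB per byte, rotating
// through the 4 histogram slices. nblocks16 counts whole 16-byte blocks; the name
// histo_sse4_pextrb_sketch is made up. Needs <smmintrin.h> (MSVC's <intrin.h> already has it).
static void histo_sse4_pextrb_sketch(U32 * counts4x, const U8 * p, size_t nblocks16)
{
    for (size_t i = 0; i < nblocks16; i++, p += 16)
    {
        __m128i v = _mm_loadu_si128((const __m128i *)p);
        counts4x[0*256 + _mm_extract_epi8(v, 0)]++;
        counts4x[1*256 + _mm_extract_epi8(v, 1)]++;
        counts4x[2*256 + _mm_extract_epi8(v, 2)]++;
        counts4x[3*256 + _mm_extract_epi8(v, 3)]++;
        counts4x[0*256 + _mm_extract_epi8(v, 4)]++;
        counts4x[1*256 + _mm_extract_epi8(v, 5)]++;
        counts4x[2*256 + _mm_extract_epi8(v, 6)]++;
        counts4x[3*256 + _mm_extract_epi8(v, 7)]++;
        counts4x[0*256 + _mm_extract_epi8(v, 8)]++;
        counts4x[1*256 + _mm_extract_epi8(v, 9)]++;
        counts4x[2*256 + _mm_extract_epi8(v, 10)]++;
        counts4x[3*256 + _mm_extract_epi8(v, 11)]++;
        counts4x[0*256 + _mm_extract_epi8(v, 12)]++;
        counts4x[1*256 + _mm_extract_epi8(v, 13)]++;
        counts4x[2*256 + _mm_extract_epi8(v, 14)]++;
        counts4x[3*256 + _mm_extract_epi8(v, 15)]++;
    }
}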
extern "C" void histo_asm_avx256_8x_core1(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_avx256_8x_core2(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_avx256_8x_core3(U32 * histo, const U8 * bytes, size_t nbytes); | |
static void histo_asm_avx256_8x_1(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[8*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_avx256_8x_core1(countsArray, rawArray, rawLen); | |
finish_histo_8x(counts, countsArray); | |
} | |
static void histo_asm_avx256_8x_2(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[8*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_avx256_8x_core2(countsArray, rawArray, rawLen); | |
finish_histo_8x(counts, countsArray); | |
} | |
static void histo_asm_avx256_8x_3(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[8*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_avx256_8x_core3(countsArray, rawArray, rawLen); | |
finish_histo_8x(counts, countsArray); | |
} | |
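// Intrinsics sketch (assumed, not the source of the asm cores) of the basic
// gather/increment/scatter step behind the "AVX-256" variants, closest to variant 2:
// zero-extend 8 bytes to 8 dwords, add the per-lane slice base so lanes can't collide,
// gather the bins, add 1, scatter back. The scatter intrinsic needs AVX-512F+VL and
// <immintrin.h>; nblocks8 and histo_avx256_gs_sketch are made-up names.
static void histo_avx256_gs_sketch(U32 * counts8x, const U8 * p, size_t nblocks8)
{
    const __m256i base = _mm256_setr_epi32(0*256, 1*256, 2*256, 3*256, 4*256, 5*256, 6*256, 7*256);
    const __m256i one  = _mm256_set1_epi32(1);

    for (size_t i = 0; i < nblocks8; i++, p += 8)
    {
        __m256i lanes = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)p)); // 8 bytes -> 8 dwords
        __m256i idx   = _mm256_add_epi32(lanes, base);                             // lane n goes to slice n
        __m256i bins  = _mm256_i32gather_epi32((const int *)counts8x, idx, 4);     // load current counts
        _mm256_i32scatter_epi32(counts8x, idx, _mm256_add_epi32(bins, one), 4);    // store counts+1
    }
}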
extern "C" void histo_asm_avx512_core_conflict(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_avx512_core_8x(U32 * histo, const U8 * bytes, size_t nbytes); | |
static void histo_asm_avx512_conflict(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
memset(counts,0,256*sizeof(U32)); | |
histo_asm_avx512_core_conflict(counts, rawArray, rawLen); | |
} | |
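// Intrinsics sketch (my reading of histo_asm_avx512_core_conflict, not the code it was
// built from) of the VPCONFLICTD algorithm: gather once, then iterate only over lanes
// whose index also appears in an earlier lane, accumulating per-lane increments until
// every conflict chain terminates. Requires AVX-512F/CD and <immintrin.h>;
// nblocks16 and histo_avx512_conflict_sketch are made-up names.
static void histo_avx512_conflict_sketch(U32 * counts, const U8 * p, size_t nblocks16)
{
    const __m512i one  = _mm512_set1_epi32(1);
    const __m512i v31  = _mm512_set1_epi32(31);
    const __m512i neg1 = _mm512_set1_epi32(-1);

    for (size_t i = 0; i < nblocks16; i++, p += 16)
    {
        __m512i idx  = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i *)p));
        __m512i conf = _mm512_conflict_epi32(idx);            // per lane: bitmask of earlier lanes with the same value
        __m512i inc  = one;                                   // per-lane increment
        __m512i bins = _mm512_i32gather_epi32(idx, counts, 4);

        __mmask16 k = _mm512_test_epi32_mask(conf, conf);     // lanes that collide with an earlier lane
        if (k)
        {
            // 31 - lzcnt(conflicts) = index of the closest earlier lane with the same value (-1 if none)
            __m512i perm = _mm512_sub_epi32(v31, _mm512_lzcnt_epi32(conf));
            do
            {
                __m512i fetched = _mm512_maskz_permutexvar_epi32(k, perm, inc); // pull that lane's increment
                perm = _mm512_mask_permutexvar_epi32(perm, k, perm, perm);      // follow the conflict chain
                inc  = _mm512_mask_add_epi32(inc, k, inc, fetched);             // accumulate
                k    = _mm512_cmpneq_epi32_mask(perm, neg1);                    // done once every chain hits -1
            } while (k);
        }

        // for duplicate indices the scatter keeps the highest lane, which carries the full count
        _mm512_i32scatter_epi32(counts, idx, _mm512_add_epi32(bins, inc), 4);
    }
}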
static void histo_asm_avx512_8x(U32 * counts, const U8 * rawArray, size_t rawLen)
{
    U32 __declspec(align(16)) countsArray[8*256];
    memset(countsArray,0,sizeof(countsArray));
    histo_asm_avx512_core_8x(countsArray, rawArray, rawLen);
    finish_histo_8x(counts, countsArray);
}
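// Intrinsics sketch (assumed, mirroring histo_asm_avx512_core_8x rather than being its
// source) of the 16-lane version with manual conflict handling: both 8-lane halves share
// the same 8 slice bases, so the only possible collision is lane i vs lane i+8; where they
// match, the low lane adds 2 and the high lane's scatter is masked off. Requires
// AVX-512F and <immintrin.h>; nblocks16 and histo_avx512_8x_sketch are made-up names.
static void histo_avx512_8x_sketch(U32 * counts8x, const U8 * p, size_t nblocks16)
{
    const __m512i base = _mm512_setr_epi32(0*256, 1*256, 2*256, 3*256, 4*256, 5*256, 6*256, 7*256,
                                           0*256, 1*256, 2*256, 3*256, 4*256, 5*256, 6*256, 7*256);
    const __m512i one  = _mm512_set1_epi32(1);
    const __m512i two  = _mm512_set1_epi32(2);

    for (size_t i = 0; i < nblocks16; i++, p += 16)
    {
        __m512i v   = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i *)p));
        __m512i idx = _mm512_add_epi32(v, base);                       // lane n -> slice n&7
        __m512i rot = _mm512_alignr_epi32(v, v, 8);                    // input rotated by 8 lanes

        // low lanes whose byte matches their high counterpart (these would collide)
        __mmask16 kdup = _mm512_mask_cmpeq_epi32_mask(0x00ff, v, rot);

        __m512i bins = _mm512_i32gather_epi32(idx, counts8x, 4);
        __m512i inc  = _mm512_mask_blend_epi32(kdup, one, two);        // matching low lanes add 2
        __mmask16 kw = (__mmask16)~((unsigned)kdup << 8);              // drop the matching high lanes
        _mm512_mask_i32scatter_epi32(counts8x, kw, idx, _mm512_add_epi32(bins, inc), 4);
    }
}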
typedef void histo_func(U32 * histo, const U8 * bytes, size_t nbytes);

U32 g_histo[256]; // global sink

static void testit(const char *name, histo_func *fn, const U8 * bytes, size_t nbytes)
{
    // check correctness
    U32 ref_histo[256];
    U32 our_histo[256];
    histo_ref(ref_histo, bytes, nbytes);
    fn(our_histo, bytes, nbytes);
    if (memcmp(ref_histo, our_histo, sizeof(our_histo)) != 0)
    {
        printf("%30s: incorrect result!\n", name);
        return;
    }

    // check timing
    U64 min_dur = ~(U64)0;
    for (int run = 0; run < 7500; run++)
    {
        U64 start = cycle_timer();
        fn(g_histo, bytes, nbytes);
        U64 duration = cycle_timer() - start;
        if (duration < min_dur)
            min_dur = duration;
    }

    printf("%30s: %llu (%.2f/byte)\n", name, min_dur, 1.0*min_dur/nbytes);
}
int main(int argc, char **argv)
{
    if (argc < 2)
    {
        fprintf(stderr, "Usage: histotest <filename>\n");
        return 1;
    }

    const char *filename = argv[1];
    size_t size;
    U8 *bytes = read_file(filename, &size);
    if (!bytes)
    {
        fprintf(stderr, "Error loading '%s'\n", filename);
        return 1;
    }

    printf("test file: %d bytes\n", (int)size);

#define TESTIT(name) testit(#name,name,bytes,size)

    TESTIT(histo_cpp_1x);
    TESTIT(histo_cpp_2x);
    TESTIT(histo_cpp_4x);
    TESTIT(histo_asm_scalar4);
    TESTIT(histo_asm_scalar8);
    TESTIT(histo_asm_sse4);
    TESTIT(histo_asm_avx256_8x_1);
    TESTIT(histo_asm_avx256_8x_2);
    TESTIT(histo_asm_avx256_8x_3);
    TESTIT(histo_asm_avx512_conflict);
    TESTIT(histo_asm_avx512_8x);

    delete[] bytes;
    return 0;
}