Last active
December 8, 2015 10:45
-
-
Save amosr/8dba472064a9bbecd601 to your computer and use it in GitHub Desktop.
SSE int parsing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <x86intrin.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
/*
 * Place values for an 8-character digit window, most significant first,
 * followed by sixteen zeros.  read64() loads eight consecutive entries
 * starting at offset (8 - digit_count); the zero tail makes every lane at or
 * beyond the first non-digit contribute nothing to the sum.
 */
static const uint32_t powers_of_ten_multipliers[] = {
    /* 10^7 .. 10^0 */
    10000000, 1000000, 100000, 10000,
    1000,     100,     10,     1,
    /* zero padding */
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
};
/* powers_of_ten[k] == 10^k for k = 0 .. 8 (nine entries). */
static const uint32_t powers_of_ten[] = {
    1,
    10,
    100,
    1000,
    10000,
    100000,
    1000000,
    10000000,
    100000000,
};
/*
 * Skip leading blanks (' ' and '\t') starting at `in`.
 *
 * in   Start of the text.  The text is scanned with unaligned 16-byte loads,
 *      so at least 16 bytes must be readable from every position visited.
 * out  Receives the address of the first byte that is neither a space nor a
 *      tab.  It is written on every return path.
 *
 * Returns the 16 bytes loaded from *out.
 *
 * The comparison uses explicit lengths, so a NUL byte is treated as an
 * ordinary non-blank character and ends the scan.
 */
static __m128i strip_ws(const char* in, const char** out)
{
    /* Two-element set {' ', '\t'}; the remaining lanes are ignored (length 2). */
    const __m128i blanks = _mm_setr_epi8(' ', '\t', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

    for (;;) {
        const __m128i chunk = _mm_loadu_si128((const __m128i*)in);
        /* Index of the first byte NOT in the blank set, or 16 if all 16 are blank. */
        const int index = _mm_cmpestri(blanks, 2, chunk, 16, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_NEGATIVE_POLARITY);

        if (__builtin_expect(index == 0, 1)) {
            /*
             * Already positioned on a non-blank.  *out must still be stored:
             * after skipping one or more full 16-byte runs of blanks, `in`
             * no longer equals the pointer the caller passed.
             */
            *out = in;
            return chunk;
        }
        if (index < 16) {
            in += index;
            *out = in;
            return _mm_loadu_si128((const __m128i*)in);
        }
        /* All 16 bytes were blank: move on to the next chunk. */
        in += 16;
    }
}
/*
 * Return the position (0..15) of the first byte in `m` that is not an ASCII
 * digit, or 16 when all sixteen bytes are digits.
 */
static unsigned int first_nondigit(const __m128i m)
{
    /* One inclusive byte range ['0', '9']; only the first two lanes are used. */
    const __m128i ascii_digits = _mm_setr_epi8(
        '0', '9',
        0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0);

    return (unsigned int)_mm_cmpestri(ascii_digits, 2, m, 16, _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_NEGATIVE_POLARITY);
}
static unsigned int read64(const __m128i m, unsigned int index) | |
{ | |
const __m128i lows = _mm_cvtepu8_epi32(m); | |
const __m128i m_shifted = _mm_srli_si128(m, 4); | |
const __m128i highs = _mm_cvtepu8_epi32(m_shifted); | |
const uint32_t zero_words[] = { 48, 48, 48, 48 }; | |
const __m128i zeros = _mm_loadu_si128((__m128i*)zero_words); | |
const __m128i low_digits = lows - zeros; | |
const __m128i hi_digits = highs - zeros; | |
const int pow_ten_offset = (index < 8) ? 8 - index : 0; | |
const __m128i lo_muls = _mm_loadu_si128((__m128i*)(powers_of_ten_multipliers + pow_ten_offset)); | |
const __m128i hi_muls = _mm_loadu_si128((__m128i*)(powers_of_ten_multipliers + pow_ten_offset + 4)); | |
const __m128i lo_mulled = _mm_mullo_epi32(lo_muls, low_digits); | |
const __m128i hi_mulled = _mm_mullo_epi32(hi_muls, hi_digits); | |
const __m128i sum1 = _mm_hadd_epi32(lo_mulled, hi_mulled); | |
const __m128i sum2 = _mm_hadd_epi32(sum1, sum1); | |
const __m128i sum3 = _mm_hadd_epi32(sum2, sum2); | |
return _mm_extract_epi32(sum3, 0); | |
} | |
void read_int(const char* in, const char** out_end, uint64_t* out_val) | |
{ | |
__m128i m = strip_ws(in, &in); | |
uint64_t sign = 1; | |
if (_mm_extract_epi8(m, 0) == '-') { | |
in++; | |
m = _mm_loadu_si128((__m128i*)in); | |
sign = -1; | |
} | |
unsigned int index = first_nondigit(m); | |
uint64_t int_out = read64(m, index); | |
if (__builtin_expect(index > 8, 0)) { | |
const __m128i m_shift = _mm_srli_si128(m, 8); | |
uint64_t i2 = read64(m_shift, index - 8); | |
uint64_t mul = powers_of_ten[index - 8]; | |
int_out = int_out * mul + i2; | |
if (__builtin_expect(index == 16, 0)) { | |
in += 16; | |
const __m128i m_left = _mm_loadu_si128((__m128i*)in); | |
index = first_nondigit(m_left); | |
if (__builtin_expect(index > 3, 0)) { | |
// fprintf(stderr, "ERROR NUMBER TOO BIG\n"); | |
} | |
uint64_t i3 = read64(m_left, index); | |
int_out = int_out * powers_of_ten[index] + i3; | |
} | |
} | |
*out_val = int_out * sign; | |
*out_end = in + index; | |
} | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.section __TEXT,__text,regular,pure_instructions | |
.section __TEXT,__const | |
.align 4 | |
## 16-byte constant: the byte pair 0x20 (space), 0x09 (tab), zero-filled.
## Loaded into %xmm1 as the vpcmpestri operand (with %eax = 2) for the
## blank-skipping code at the start of _read_int.
LCPI0_0: | |
.byte 32 ## 0x20 | |
.byte 9 ## 0x9 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
## 16-byte constant: the byte pair 0x30 ('0'), 0x39 ('9'), zero-filled.
## Loaded into %xmm1 as the vpcmpestri operand (with %eax = 2) for the
## digit scan in _read_int.
LCPI0_1: | |
.byte 48 ## 0x30 | |
.byte 57 ## 0x39 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
.byte 0 ## 0x0 | |
## Two quads, each equal to -(48 + (48 << 32)).  _read_int adds this with
## vpaddq to the zero-extended characters, i.e. the C source's subtraction of
## the packed 32-bit value 48 is carried out per 64-bit lane.  Viewed as
## dwords, the low half is -48 (0xffffffd0) and the high half is 0xffffffcf.
LCPI0_2: | |
.quad -206158430256 ## 0xffffffcfffffffd0 | |
.quad -206158430256 ## 0xffffffcfffffffd0 | |
.section __TEXT,__text,regular,pure_instructions | |
## read_int: the code below reads text through %rdi, stores the end pointer
## through %rsi and stores the parsed value through the pointer received in
## %rdx.  The block labels (%strip_ws.exit, %read64.exit*) show that the C
## helpers were inlined into this one function.
.globl _read_int | |
.align 4, 0x90 | |
_read_int: ## @read_int | |
.cfi_startproc | |
## BB#0: | |
pushq %rbp | |
Ltmp3: | |
.cfi_def_cfa_offset 16 | |
Ltmp4: | |
.cfi_offset %rbp, -16 | |
movq %rsp, %rbp | |
Ltmp5: | |
.cfi_def_cfa_register %rbp | |
pushq %rbx | |
Ltmp6: | |
.cfi_offset %rbx, -24 | |
## Keep the value-output pointer in %r8; %edx is reused below as a
## vpcmpestri length operand.
movq %rdx, %r8 | |
## Load the first 16 input bytes and search them against the space/tab set.
## %eax = 2 and %edx = 16 are the explicit lengths; the result index is
## returned in %ecx.
vmovdqu (%rdi), %xmm0 | |
vmovdqa LCPI0_0(%rip), %xmm1 | |
movl $2, %eax | |
movl $16, %edx | |
vpcmpestri $16, %xmm0, %xmm1 | |
## Index 0 means the text already starts on a non-blank byte.
testl %ecx, %ecx | |
jne LBB0_1 | |
LBB0_5: ## %strip_ws.exit | |
## Sign handling: compare byte 0 with 45 ('-').  %r9 holds the sign
## multiplier, 1 by default.
vpextrb $0, %xmm0, %eax | |
movl $1, %r9d | |
cmpl $45, %eax | |
jne LBB0_7 | |
## BB#6: | |
## A '-' was found: reload from the next byte, advance %rdi, sign = -1.
vmovdqu 1(%rdi), %xmm0 | |
incq %rdi | |
movq $-1, %r9 | |
LBB0_7: | |
## Digit scan against the '0'..'9' pair; %ecx receives the index of the first
## non-digit byte.
vmovdqa LCPI0_1(%rip), %xmm1 | |
movl $2, %eax | |
movl $16, %edx | |
vpcmpestri $20, %xmm0, %xmm1 | |
## kill: ECX<def> ECX<kill> RCX<def> | |
## Widen bytes 0..3 and 4..7 to dwords and add the LCPI0_2 constant.
vpmovzxbd %xmm0, %xmm3 | |
vpsrldq $4, %xmm0, %xmm2 | |
vpmovzxbd %xmm2, %xmm4 | |
vmovdqa LCPI0_2(%rip), %xmm2 | |
vpaddq %xmm2, %xmm3, %xmm3 | |
vpaddq %xmm2, %xmm4, %xmm4 | |
## %rax = multiplier-table offset: 0 when %ecx > 7, otherwise 8 - %ecx.
xorl %eax, %eax | |
cmpl $7, %ecx | |
ja LBB0_9 | |
## BB#8: | |
movl $8, %eax | |
subl %ecx, %eax | |
cltq | |
LBB0_9: ## %read64.exit13 | |
## Multiply the eight lanes by their table entries and sum them horizontally
## into %r10d.
leaq _powers_of_ten_multipliers(%rip), %r11 | |
vpmulld (%r11,%rax,4), %xmm3, %xmm3 | |
vpmulld 16(%r11,%rax,4), %xmm4, %xmm4 | |
vphaddd %xmm4, %xmm3, %xmm3 | |
vphaddd %xmm3, %xmm3, %xmm3 | |
vphaddd %xmm3, %xmm3, %xmm3 | |
vmovd %xmm3, %r10d | |
## Nine or more digits take the long path at LBB0_10.
cmpl $9, %ecx | |
jae LBB0_10 | |
LBB0_16: | |
## Common exit: value = %r10 * sign, stored through %r8; end pointer =
## %rdi + %ecx, stored through %rsi.
imulq %r9, %r10 | |
movq %r10, (%r8) | |
movl %ecx, %eax | |
addq %rdi, %rax | |
movq %rax, (%rsi) | |
popq %rbx | |
popq %rbp | |
retq | |
LBB0_1: | |
## Blank-skipping slow path; %rbx is the scan pointer.
movq %rdi, %rbx | |
LBB0_3: ## %.lr.ph.i | |
## =>This Inner Loop Header: Depth=1 | |
## Index <= 15: the first non-blank lies inside this chunk (LBB0_4).
cmpl $15, %ecx | |
jle LBB0_4 | |
## BB#2: ## %tailrecurse.i | |
## in Loop: Header=BB0_3 Depth=1 | |
## All 16 bytes were blank: load the next 16 and search again.
vmovdqu 16(%rbx), %xmm0 | |
addq $16, %rbx | |
movl $2, %eax | |
movl $16, %edx | |
vpcmpestri $16, %xmm0, %xmm1 | |
testl %ecx, %ecx | |
## NOTE(review): this branch returns to LBB0_5 with %xmm0 loaded from %rbx
## but %rdi still holding the original pointer; the code after LBB0_5
## addresses the text through %rdi.
je LBB0_5 | |
jmp LBB0_3 | |
LBB0_10: | |
## More than eight digits: convert bytes 8..15 the same way, with
## %eax = %ecx - 8.
vpsrldq $8, %xmm0, %xmm0 | |
leal -8(%rcx), %eax | |
vpmovzxbd %xmm0, %xmm3 | |
vpsrldq $4, %xmm0, %xmm0 | |
vpmovzxbd %xmm0, %xmm4 | |
vpaddq %xmm2, %xmm3, %xmm0 | |
vpaddq %xmm2, %xmm4, %xmm3 | |
## %rdx = multiplier-table offset: 0 when %eax > 7, otherwise 16 - %ecx.
xorl %edx, %edx | |
cmpl $7, %eax | |
ja LBB0_12 | |
## BB#11: | |
movl $16, %edx | |
subl %ecx, %edx | |
movslq %edx, %rdx | |
LBB0_12: ## %read64.exit8 | |
vpmulld (%r11,%rdx,4), %xmm0, %xmm0 | |
vpmulld 16(%r11,%rdx,4), %xmm3, %xmm3 | |
vphaddd %xmm3, %xmm0, %xmm0 | |
vphaddd %xmm0, %xmm0, %xmm0 | |
vphaddd %xmm0, %xmm0, %xmm0 | |
vmovd %xmm0, %edx | |
## %r10 = %r10 * _powers_of_ten[%ecx - 8] + second-half value.
movl %eax, %eax | |
leaq _powers_of_ten(%rip), %rbx | |
movl (%rbx,%rax,4), %eax | |
imulq %rax, %r10 | |
addq %rdx, %r10 | |
## Exactly 16 digits so far: continue with the next 16 bytes.
cmpl $16, %ecx | |
jne LBB0_16 | |
## BB#13: | |
vmovdqu 16(%rdi), %xmm0 | |
movl $2, %eax | |
movl $16, %edx | |
vpcmpestri $20, %xmm0, %xmm1 | |
## kill: ECX<def> ECX<kill> RCX<def> | |
vpmovzxbd %xmm0, %xmm1 | |
vpsrldq $4, %xmm0, %xmm0 | |
vpmovzxbd %xmm0, %xmm3 | |
vpaddq %xmm2, %xmm1, %xmm0 | |
vpaddq %xmm2, %xmm3, %xmm1 | |
xorl %eax, %eax | |
cmpl $7, %ecx | |
ja LBB0_15 | |
## BB#14: | |
movl $8, %eax | |
subl %ecx, %eax | |
cltq | |
LBB0_15: ## %read64.exit | |
addq $16, %rdi | |
vpmulld (%r11,%rax,4), %xmm0, %xmm0 | |
vpmulld 16(%r11,%rax,4), %xmm1, %xmm1 | |
vphaddd %xmm1, %xmm0, %xmm0 | |
vphaddd %xmm0, %xmm0, %xmm0 | |
vphaddd %xmm0, %xmm0, %xmm0 | |
vmovd %xmm0, %eax | |
## NOTE(review): %ecx (the new digit count, produced by the scan above) is
## used unchecked as an index into _powers_of_ten, which has nine entries.
movl %ecx, %edx | |
movl (%rbx,%rdx,4), %edx | |
imulq %r10, %rdx | |
addq %rax, %rdx | |
movq %rdx, %r10 | |
jmp LBB0_16 | |
LBB0_4: | |
## First non-blank found inside the chunk: %rdi = %rbx + index, and reload
## 16 bytes from there.
movslq %ecx, %rax | |
leaq (%rbx,%rax), %rdi | |
vmovdqu (%rbx,%rax), %xmm0 | |
jmp LBB0_5 | |
.cfi_endproc | |
.section __TEXT,__const | |
## Nine 32-bit entries holding 10^0 through 10^8.  _read_int indexes this
## table with a digit count scaled by 4.
.align 4 ## @powers_of_ten | |
_powers_of_ten: | |
.long 1 ## 0x1 | |
.long 10 ## 0xa | |
.long 100 ## 0x64 | |
.long 1000 ## 0x3e8 | |
.long 10000 ## 0x2710 | |
.long 100000 ## 0x186a0 | |
.long 1000000 ## 0xf4240 | |
.long 10000000 ## 0x989680 | |
.long 100000000 ## 0x5f5e100 | |
.align 4 ## @powers_of_ten_multipliers | |
_powers_of_ten_multipliers: | |
.long 10000000 ## 0x989680 | |
.long 1000000 ## 0xf4240 | |
.long 100000 ## 0x186a0 | |
.long 10000 ## 0x2710 | |
.long 1000 ## 0x3e8 | |
.long 100 ## 0x64 | |
.long 10 ## 0xa | |
.long 1 ## 0x1 | |
.long 0 ## 0x0 | |
.long 0 ## 0x0 | |
.long 0 ## 0x0 | |
.long 0 ## 0x0 | |
.long 0 ## 0x0 | |
.long 0 ## 0x0 | |
.long 0 ## 0x0 | |
.long 0 ## 0x0 | |
.long 0 ## 0x0 | |
.long 0 ## 0x0 | |
.long 0 ## 0x0 | |
.long 0 ## 0x0 | |
.long 0 ## 0x0 | |
.long 0 ## 0x0 | |
.long 0 ## 0x0 | |
.long 0 ## 0x0 | |
.subsections_via_symbols |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment