Skip to content

Instantly share code, notes, and snippets.

@amosr
Last active December 8, 2015 10:45
Show Gist options
  • Save amosr/8dba472064a9bbecd601 to your computer and use it in GitHub Desktop.
Save amosr/8dba472064a9bbecd601 to your computer and use it in GitHub Desktop.
SSE int parsing
#include <x86intrin.h>
#include <stdint.h>
#include <stdio.h>
/*
 * Per-lane decimal weights consumed by read64().
 *
 * Entries 0..7 hold 10^7 down to 10^0; the sixteen entries after them are
 * zero.  read64() loads eight consecutive entries starting at offset
 * (8 - index) when index < 8 (offset 0 otherwise), so every lane at or past
 * the first non-digit is multiplied by zero.
 */
static const uint32_t powers_of_ten_multipliers[] = {
    10000000, 1000000, 100000, 10000,
    1000,     100,     10,     1,
    0,        0,       0,      0,
    0,        0,       0,      0,
    0,        0,       0,      0,
    0,        0,       0,      0,
};
/*
 * powers_of_ten[k] == 10^k for k in 0..8 (nine entries).
 * read_int() uses it to scale the value accumulated so far before adding a
 * further group of k digits.
 */
static const uint32_t powers_of_ten[] = {
    1,
    10,
    100,
    1000,
    10000,
    100000,
    1000000,
    10000000,
    100000000,
};
/*
 * Skip leading spaces and tabs and return the 16 bytes that start at the
 * first byte which is neither.
 *
 * in   start of the text to scan.
 * out  receives the address of the first non-whitespace byte.  It is written
 *      on every path, including when `in` has no leading whitespace and when
 *      the whitespace run is an exact multiple of 16 bytes.
 *
 * Returns the unaligned 16-byte load taken at *out.
 *
 * Only ' ' and '\t' count as whitespace.  Each step reads a full 16 bytes, so
 * the caller must guarantee that 16 bytes are readable from every position
 * examined, and that a non-whitespace byte eventually occurs.
 */
static __m128i strip_ws(const char* in, const char** out)
{
    const __m128i white_range = _mm_setr_epi8(' ', '\t', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

    for (;;) {
        const __m128i m = _mm_loadu_si128((const __m128i*)in);
        /* Index of the first byte NOT in {' ', '\t'}; 16 if all 16 are whitespace. */
        const int index = _mm_cmpestri(white_range, 2, m, 16, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_NEGATIVE_POLARITY);

        if (__builtin_expect(index == 0, 1)) {
            /* Common case: already positioned on non-whitespace. */
            *out = in;
            return m;
        }
        if (index < 16) {
            in += index;
            *out = in;
            return _mm_loadu_si128((const __m128i*)in);
        }
        /* Whole chunk was whitespace: move on to the next 16 bytes. */
        in += 16;
    }
}
/*
 * Return the position (0..15) of the first byte of `m` that lies outside the
 * ASCII range '0'..'9', or 16 when all sixteen bytes are digits.
 */
static unsigned int first_nondigit(const __m128i m)
{
    /* One inclusive range pair in the two lowest bytes: low bound '0', high bound '9'. */
    const __m128i bounds = _mm_cvtsi32_si128('0' | ('9' << 8));

    return (unsigned int)_mm_cmpestri(bounds, 2, m, 16,
                                      _SIDD_NEGATIVE_POLARITY | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS);
}
/*
 * Convert the leading ASCII digits held in bytes 0..7 of `m` to their value.
 *
 * m      vector whose first bytes are the digits, most significant first.
 * index  number of leading digit bytes.  Values of 8 or more are treated as
 *        8: only bytes 0..7 are ever converted.
 *
 * Returns the decimal value of those digits (at most 99999999); 0 when
 * index is 0.
 *
 * Bytes at positions >= index may hold anything: they are paired with zero
 * multipliers from powers_of_ten_multipliers and contribute nothing.
 */
static unsigned int read64(const __m128i m, unsigned int index)
{
    /* Widen bytes 0..3 and bytes 4..7 into two vectors of four 32-bit lanes. */
    const __m128i lows = _mm_cvtepu8_epi32(m);
    const __m128i highs = _mm_cvtepu8_epi32(_mm_srli_si128(m, 4));

    /*
     * ASCII -> digit value, lane by lane.  An explicit 32-bit subtract keeps
     * a non-digit byte below '0' from borrowing out of its neighbouring lane,
     * which a plain `-` on __m128i (64-bit lanes) would do.
     */
    const __m128i zeros = _mm_set1_epi32('0');
    const __m128i low_digits = _mm_sub_epi32(lows, zeros);
    const __m128i hi_digits = _mm_sub_epi32(highs, zeros);

    /*
     * Slide the multiplier window so the last digit lines up with 10^0 and
     * every lane after it lines up with a zero.
     */
    const unsigned int pow_ten_offset = (index < 8) ? 8 - index : 0;
    const __m128i lo_muls = _mm_loadu_si128((const __m128i*)(powers_of_ten_multipliers + pow_ten_offset));
    const __m128i hi_muls = _mm_loadu_si128((const __m128i*)(powers_of_ten_multipliers + pow_ten_offset + 4));

    const __m128i lo_mulled = _mm_mullo_epi32(lo_muls, low_digits);
    const __m128i hi_mulled = _mm_mullo_epi32(hi_muls, hi_digits);

    /* Three horizontal adds reduce the eight products to one sum in lane 0. */
    const __m128i sum1 = _mm_hadd_epi32(lo_mulled, hi_mulled);
    const __m128i sum2 = _mm_hadd_epi32(sum1, sum1);
    const __m128i sum3 = _mm_hadd_epi32(sum2, sum2);
    return (unsigned int)_mm_extract_epi32(sum3, 0);
}
/*
 * Fold the first `count` digits of `m` onto `acc`: the result is
 * acc * 10^count + value(digits), computed modulo 2^64.
 * `count` must be in 0..16; digits are handled as two groups of at most
 * eight so that powers_of_ten is never indexed past entry 8.
 */
static uint64_t append_digits(uint64_t acc, const __m128i m, unsigned int count)
{
    const unsigned int head = (count < 8) ? count : 8;
    acc = acc * powers_of_ten[head] + read64(m, head);

    if (__builtin_expect(count > 8, 0)) {
        const unsigned int tail = count - 8;
        acc = acc * powers_of_ten[tail] + read64(_mm_srli_si128(m, 8), tail);
    }
    return acc;
}

/*
 * Parse an optionally negative decimal integer.
 *
 * in       text to parse; leading spaces and tabs are skipped via strip_ws().
 * out_end  receives the address of the first byte after the last digit.
 * out_val  receives the parsed value.  A leading '-' negates it in unsigned
 *          (two's-complement) arithmetic; a leading '+' is not recognised.
 *
 * There is no error reporting: if no digit is present the value is 0, and a
 * number that does not fit in 64 bits wraps modulo 2^64.  Digit runs of any
 * length are consumed, so *out_end always points at a non-digit.
 *
 * Every step loads 16 bytes at once; the caller must guarantee that 16 bytes
 * are readable from each position examined (in practice: pad the buffer).
 */
void read_int(const char* in, const char** out_end, uint64_t* out_val)
{
    __m128i m = strip_ws(in, &in);

    uint64_t sign = 1;
    if (_mm_extract_epi8(m, 0) == '-') {
        in++;
        m = _mm_loadu_si128((const __m128i*)in);
        sign = (uint64_t)-1;
    }

    unsigned int index = first_nondigit(m);
    uint64_t int_out = append_digits(0, m, index);

    /* index == 16 means the whole chunk was digits and more may follow. */
    while (__builtin_expect(index == 16, 0)) {
        in += 16;
        m = _mm_loadu_si128((const __m128i*)in);
        index = first_nondigit(m);
        int_out = append_digits(int_out, m, index);
    }

    *out_val = int_out * sign;
    *out_end = in + index;
}
## Constant pool for _read_int: three 16-byte vector constants placed in the
## read-only __TEXT,__const section.
.section __TEXT,__text,regular,pure_instructions
.section __TEXT,__const
.align 4
## LCPI0_0: set operand for the whitespace scan. Only the first two bytes are
## meaningful (the scan passes an explicit length of 2): ' ' (32) and '\t' (9),
## i.e. white_range in the C function strip_ws.
LCPI0_0:
.byte 32 ## 0x20
.byte 9 ## 0x9
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
## LCPI0_1: range operand for the digit scan. The first two bytes are the
## inclusive bounds '0' (48) and '9' (57), i.e. digit_range in first_nondigit.
LCPI0_1:
.byte 48 ## 0x30
.byte 57 ## 0x39
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
.byte 0 ## 0x0
## LCPI0_2: addend used with vpaddq to turn ASCII codes into digit values.
## Each quad is the 64-bit negation of 0x0000003000000030, so a 64-bit add of
## it is the compiler's form of the C expression `lows - zeros` (48 taken from
## each 32-bit half, with borrow between the halves).
LCPI0_2:
.quad -206158430256 ## 0xffffffcfffffffd0
.quad -206158430256 ## 0xffffffcfffffffd0
## _read_int: compiled body of the C function read_int, with strip_ws,
## first_nondigit and read64 inlined into it.
## Register roles as used below:
##   rdi = in (current parse position)   rsi = out_end   rdx -> r8 = out_val
##   r9  = sign (1 or -1)                r10 = accumulated value
##   ecx = index result of each vpcmpestri
##   xmm0 = current 16 input bytes       xmm2 = LCPI0_2 (ASCII->digit addend)
.section __TEXT,__text,regular,pure_instructions
.globl _read_int
.align 4, 0x90
_read_int: ## @read_int
.cfi_startproc
## BB#0:
## Prologue: set up the frame pointer and save rbx, which is used later as a
## scratch pointer.
pushq %rbp
Ltmp3:
.cfi_def_cfa_offset 16
Ltmp4:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp5:
.cfi_def_cfa_register %rbp
pushq %rbx
Ltmp6:
.cfi_offset %rbx, -24
## Keep out_val in r8: edx is needed as the explicit length for vpcmpestri.
movq %rdx, %r8
## strip_ws, first probe: load 16 bytes at `in` and find the first byte that
## is not ' ' or '\t'. eax = 2 (set length), edx = 16 (data length);
## immediate 16 (0x10) selects negative polarity with the equal-any compare.
vmovdqu (%rdi), %xmm0
vmovdqa LCPI0_0(%rip), %xmm1
movl $2, %eax
movl $16, %edx
vpcmpestri $16, %xmm0, %xmm1
## index != 0 means leading whitespace is present: take the slow path.
testl %ecx, %ecx
jne LBB0_1
LBB0_5: ## %strip_ws.exit
## Sign handling: r9 = 1 unless the first byte is '-' (45).
vpextrb $0, %xmm0, %eax
movl $1, %r9d
cmpl $45, %eax
jne LBB0_7
## BB#6:
## Negative number: reload the 16 bytes after the '-', advance `in`, r9 = -1.
vmovdqu 1(%rdi), %xmm0
incq %rdi
movq $-1, %r9
LBB0_7:
## first_nondigit: immediate 20 (0x14) selects the range compare with
## negative polarity, so ecx = index of the first byte outside '0'..'9'
## (16 if none).
vmovdqa LCPI0_1(%rip), %xmm1
movl $2, %eax
movl $16, %edx
vpcmpestri $20, %xmm0, %xmm1
## kill: ECX<def> ECX<kill> RCX<def>
## read64 on bytes 0..7: zero-extend bytes 0..3 (xmm3) and 4..7 (xmm4) to
## 32-bit lanes, then subtract '0' by adding LCPI0_2.
vpmovzxbd %xmm0, %xmm3
vpsrldq $4, %xmm0, %xmm2
vpmovzxbd %xmm2, %xmm4
vmovdqa LCPI0_2(%rip), %xmm2
vpaddq %xmm2, %xmm3, %xmm3
vpaddq %xmm2, %xmm4, %xmm4
## Multiplier-table offset in rax: 0 when index > 7, otherwise 8 - index.
xorl %eax, %eax
cmpl $7, %ecx
ja LBB0_9
## BB#8:
movl $8, %eax
subl %ecx, %eax
cltq
LBB0_9: ## %read64.exit13
## Multiply each digit by its power of ten and sum the eight products with
## three horizontal adds; the 32-bit result lands in r10d (upper half of r10
## cleared by the 32-bit write).
leaq _powers_of_ten_multipliers(%rip), %r11
vpmulld (%r11,%rax,4), %xmm3, %xmm3
vpmulld 16(%r11,%rax,4), %xmm4, %xmm4
vphaddd %xmm4, %xmm3, %xmm3
vphaddd %xmm3, %xmm3, %xmm3
vphaddd %xmm3, %xmm3, %xmm3
vmovd %xmm3, %r10d
## More than 8 digits (index >= 9): continue with the second group.
cmpl $9, %ecx
jae LBB0_10
LBB0_16:
## Common exit: *out_val = value * sign, *out_end = in + index.
imulq %r9, %r10
movq %r10, (%r8)
movl %ecx, %eax
addq %rdi, %rax
movq %rax, (%rsi)
popq %rbx
popq %rbp
retq
LBB0_1:
## strip_ws slow path: rbx walks the input while rdi still holds the
## original `in`.
movq %rdi, %rbx
LBB0_3: ## %.lr.ph.i
## =>This Inner Loop Header: Depth=1
## index <= 15: the first non-whitespace byte is inside this chunk.
cmpl $15, %ecx
jle LBB0_4
## BB#2: ## %tailrecurse.i
## in Loop: Header=BB0_3 Depth=1
## All 16 bytes were whitespace: step to the next chunk and rescan (xmm1
## still holds the whitespace set).
vmovdqu 16(%rbx), %xmm0
addq $16, %rbx
movl $2, %eax
movl $16, %edx
vpcmpestri $16, %xmm0, %xmm1
## NOTE(review): when the new chunk starts with non-whitespace (ecx == 0) the
## code jumps to LBB0_5 without copying rbx into rdi, so `in` still points at
## the original position. This mirrors the C source, where strip_ws returns on
## its index == 0 path without writing *out.
testl %ecx, %ecx
je LBB0_5
jmp LBB0_3
LBB0_10:
## Second group: digits 8..15 of the current chunk. Shift them down to bytes
## 0..7, eax = index - 8 = number of digits in this group.
vpsrldq $8, %xmm0, %xmm0
leal -8(%rcx), %eax
vpmovzxbd %xmm0, %xmm3
vpsrldq $4, %xmm0, %xmm0
vpmovzxbd %xmm0, %xmm4
vpaddq %xmm2, %xmm3, %xmm0
vpaddq %xmm2, %xmm4, %xmm3
## Multiplier offset in rdx: 0 when (index - 8) > 7, otherwise
## 16 - index, i.e. 8 - (index - 8).
xorl %edx, %edx
cmpl $7, %eax
ja LBB0_12
## BB#11:
movl $16, %edx
subl %ecx, %edx
movslq %edx, %rdx
LBB0_12: ## %read64.exit8
vpmulld (%r11,%rdx,4), %xmm0, %xmm0
vpmulld 16(%r11,%rdx,4), %xmm3, %xmm3
vphaddd %xmm3, %xmm0, %xmm0
vphaddd %xmm0, %xmm0, %xmm0
vphaddd %xmm0, %xmm0, %xmm0
vmovd %xmm0, %edx
## r10 = r10 * powers_of_ten[index - 8] + value of the second group.
movl %eax, %eax
leaq _powers_of_ten(%rip), %rbx
movl (%rbx,%rax,4), %eax
imulq %rax, %r10
addq %rdx, %r10
## Only when all 16 bytes were digits is a further chunk examined.
cmpl $16, %ecx
jne LBB0_16
## BB#13:
## Third group: load the next 16 bytes and rescan for the first non-digit
## (xmm1 still holds the digit range). ecx becomes the new index.
vmovdqu 16(%rdi), %xmm0
movl $2, %eax
movl $16, %edx
vpcmpestri $20, %xmm0, %xmm1
## kill: ECX<def> ECX<kill> RCX<def>
vpmovzxbd %xmm0, %xmm1
vpsrldq $4, %xmm0, %xmm0
vpmovzxbd %xmm0, %xmm3
vpaddq %xmm2, %xmm1, %xmm0
vpaddq %xmm2, %xmm3, %xmm1
## Multiplier offset in rax: 0 when index > 7, otherwise 8 - index.
xorl %eax, %eax
cmpl $7, %ecx
ja LBB0_15
## BB#14:
movl $8, %eax
subl %ecx, %eax
cltq
LBB0_15: ## %read64.exit
## `in` advances by 16 so that the exit code computes in + index correctly.
addq $16, %rdi
vpmulld (%r11,%rax,4), %xmm0, %xmm0
vpmulld 16(%r11,%rax,4), %xmm1, %xmm1
vphaddd %xmm1, %xmm0, %xmm0
vphaddd %xmm0, %xmm0, %xmm0
vphaddd %xmm0, %xmm0, %xmm0
vmovd %xmm0, %eax
## r10 = r10 * powers_of_ten[index] + value of the third group.
## NOTE(review): ecx is used as the table index with no bound check; the table
## has nine entries, so an index above 8 reads past its end.
movl %ecx, %edx
movl (%rbx,%rdx,4), %edx
imulq %r10, %rdx
addq %rax, %rdx
movq %rdx, %r10
jmp LBB0_16
LBB0_4:
## strip_ws found non-whitespace mid-chunk: in = rbx + index, reload 16 bytes
## from there and rejoin the main path.
movslq %ecx, %rax
leaq (%rbx,%rax), %rdi
vmovdqu (%rbx,%rax), %xmm0
jmp LBB0_5
.cfi_endproc
## _powers_of_ten: read-only table of nine 32-bit values, 10^0 through 10^8
## (the C array powers_of_ten). _read_int indexes it with a 4-byte scale to
## rescale the accumulated value before adding the next group of digits.
.section __TEXT,__const
.align 4 ## @powers_of_ten
_powers_of_ten:
.long 1 ## 0x1
.long 10 ## 0xa
.long 100 ## 0x64
.long 1000 ## 0x3e8
.long 10000 ## 0x2710
.long 100000 ## 0x186a0
.long 1000000 ## 0xf4240
.long 10000000 ## 0x989680
.long 100000000 ## 0x5f5e100
## _powers_of_ten_multipliers: read-only table of 24 32-bit values (the C
## array powers_of_ten_multipliers): 10^7 down to 10^0 followed by sixteen
## zeros. _read_int loads two 4-lane vectors from it at a variable offset, so
## lanes past the last digit pick up a zero multiplier.
.align 4 ## @powers_of_ten_multipliers
_powers_of_ten_multipliers:
.long 10000000 ## 0x989680
.long 1000000 ## 0xf4240
.long 100000 ## 0x186a0
.long 10000 ## 0x2710
.long 1000 ## 0x3e8
.long 100 ## 0x64
.long 10 ## 0xa
.long 1 ## 0x1
.long 0 ## 0x0
.long 0 ## 0x0
.long 0 ## 0x0
.long 0 ## 0x0
.long 0 ## 0x0
.long 0 ## 0x0
.long 0 ## 0x0
.long 0 ## 0x0
.long 0 ## 0x0
.long 0 ## 0x0
.long 0 ## 0x0
.long 0 ## 0x0
.long 0 ## 0x0
.long 0 ## 0x0
.long 0 ## 0x0
.long 0 ## 0x0
.subsections_via_symbols
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment