amosr · December 8, 2015 10:45
diff --git a/int.c b/int.c
 #include <x86intrin.h>
 #include <stdint.h>
 #include <stdio.h>


 // Per-digit place values used by read64(). read64() loads eight consecutive
 // entries starting at offset (8 - digit_count), so the last digit lines up
 // with the entry 1 and every lane past the digits is multiplied by 0.
 static const uint32_t powers_of_ten_multipliers[]
 = { 10000000, 1000000, 100000, 10000
    , 1000    , 100    , 10    , 1
    , 0       , 0      , 0     , 0
    , 0       , 0      , 0     , 0
    , 0       , 0      , 0     , 0
    , 0       , 0      , 0     , 0
    };

 // powers_of_ten[n] == 10^n for n in 0..8; read_int() indexes it with a digit
 // count to shift an already-parsed prefix left by that many decimal places.
 static const uint32_t powers_of_ten[]
 = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000};


 // Skip leading spaces and tabs starting at `in`, scanning 16 bytes per step.
 //
 // in:  start of the text; every step performs an unaligned 16-byte load, so
 //      the bytes following the text must be readable.
 // out: receives the address of the first byte that is neither ' ' nor '\t'.
 //      It is written on every path, including when the whitespace run ends
 //      exactly on a 16-byte chunk boundary, so it always agrees with the
 //      returned vector.
 // Returns the 16 bytes loaded from `*out`.
 static __m128i strip_ws(const char* in, const char** out)
 {
     // Set of two bytes to match (EQUAL_ANY); the remaining lanes are padding.
     const __m128i white_set = _mm_setr_epi8(' ', '\t', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

     for (;;) {
         const __m128i m = _mm_loadu_si128((const __m128i*)in);

         // Negative polarity: index of the first byte that is NOT in the set,
         // or 16 when the whole chunk is whitespace.
         const int index = _mm_cmpestri(white_set, 2, m, 16, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_NEGATIVE_POLARITY);

         if (__builtin_expect(index == 0, 1)) {
             *out = in;
             return m;
         }
         if (index < 16) {
             in += index;
             *out = in;
             return _mm_loadu_si128((const __m128i*)in);
         }

         // Whole chunk was whitespace: move on to the next 16 bytes.
         in += 16;
     }
 }

 // Return the offset of the first byte of `m` that is not an ASCII digit
 // ('0'..'9'); the result is 16 when all sixteen bytes are digits.
 static unsigned int first_nondigit(const __m128i m)
 {
     // One inclusive range pair, '0' through '9'; the other lanes are unused.
     const __m128i digits = _mm_setr_epi8('0', '9', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
     enum { OUTSIDE_RANGE = _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_NEGATIVE_POLARITY };

     return (unsigned int)_mm_cmpestri(digits, 2, m, 16, OUTSIDE_RANGE);
 }


 // Convert the leading ASCII digits held in the low 8 bytes (64 bits) of `m`
 // to their decimal value.
 //
 // m:     text bytes; bytes 0 .. min(index, 8) - 1 are expected to be digits.
 // index: number of leading digits (as reported by first_nondigit). Values of
 //        8 or more use all eight bytes.
 // Returns the value of those digits (0 when index is 0). Lanes past the
 // digits are multiplied by 0, so their contents do not matter.
 static unsigned int read64(const __m128i m, unsigned int index)
 {
     const __m128i ascii_zero = _mm_set1_epi32('0');

     // Widen bytes 0-3 and 4-7 to 32-bit lanes and turn characters into digit
     // values. The subtraction is done per 32-bit lane with _mm_sub_epi32:
     // the `-` operator on __m128i is a compiler extension that works on
     // 64-bit lanes, letting a borrow from one digit lane leak into the next.
     const __m128i low_digits = _mm_sub_epi32(_mm_cvtepu8_epi32(m), ascii_zero);
     const __m128i hi_digits = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_srli_si128(m, 4)), ascii_zero);

     // Slide the multiplier window so that the last digit is multiplied by 1.
     const int pow_ten_offset = (index < 8) ? 8 - index : 0;

     const __m128i lo_muls = _mm_loadu_si128((const __m128i*)(powers_of_ten_multipliers + pow_ten_offset));
     const __m128i hi_muls = _mm_loadu_si128((const __m128i*)(powers_of_ten_multipliers + pow_ten_offset + 4));

     const __m128i lo_mulled = _mm_mullo_epi32(lo_muls, low_digits);
     const __m128i hi_mulled = _mm_mullo_epi32(hi_muls, hi_digits);

     // Three horizontal adds reduce the eight products to one sum in lane 0.
     const __m128i sum1 = _mm_hadd_epi32(lo_mulled, hi_mulled);
     const __m128i sum2 = _mm_hadd_epi32(sum1, sum1);
     const __m128i sum3 = _mm_hadd_epi32(sum2, sum2);

     return (unsigned int)_mm_extract_epi32(sum3, 0);
 }

 // Parse a decimal integer: optional leading spaces/tabs, an optional '-',
 // then a run of ASCII digits.
 //
 // in:      start of the text. The text is read with unaligned 16-byte loads,
 //          so the bytes following the number must be readable, and the digit
 //          run must be followed by a non-digit byte.
 // out_end: receives the address of the first byte after the digits.
 // out_val: receives the value; a leading '-' negates it in two's complement.
 //          There is no overflow detection: the result wraps modulo 2^64.
 void read_int(const char* in, const char** out_end, uint64_t* out_val)
 {
     __m128i m = strip_ws(in, &in);

     uint64_t sign = 1;
     if (_mm_extract_epi8(m, 0) == '-') {
         in++;
         m = _mm_loadu_si128((const __m128i*)in);
         sign = (uint64_t)-1;
     }

     unsigned int index = first_nondigit(m);

     // First (up to) eight digits.
     uint64_t int_out = read64(m, index);

     if (__builtin_expect(index > 8, 0)) {
         // Digits 9..16 live in the upper half of the same vector.
         const __m128i m_shift = _mm_srli_si128(m, 8);
         const uint64_t i2 = read64(m_shift, index - 8);
         int_out = int_out * powers_of_ten[index - 8] + i2;

         if (__builtin_expect(index == 16, 0)) {
             // More than 16 digits: fold the remainder one digit at a time.
             // This matches the table-based shift for short tails and avoids
             // indexing powers_of_ten (9 entries) with a count above 8.
             in += 16;
             index = 0;
             while (in[index] >= '0' && in[index] <= '9') {
                 int_out = int_out * 10 + (uint64_t)(in[index] - '0');
                 index++;
             }
         }
     }

     *out_val = int_out * sign;
     *out_end = in + index;
 }

diff --git a/int.s b/int.s
 	.section	__TEXT,__text,regular,pure_instructions
 	.section	__TEXT,__const
 	.align	4
 ## Constant pool for _read_int.
 ## LCPI0_0: the byte set { ' ', '\t' } used by the whitespace scan, zero padded.
 LCPI0_0:
 	.byte	32                      ## 0x20
 	.byte	9                       ## 0x9
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 ## LCPI0_1: the byte range '0'..'9' used by the digit scan, zero padded.
 LCPI0_1:
 	.byte	48                      ## 0x30
 	.byte	57                      ## 0x39
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 	.byte	0                       ## 0x0
 ## LCPI0_2: each quad is -((48 << 32) | 48). The C expression `lows - zeros`
 ## is emitted as a 64-bit-lane add (vpaddq) of this negated constant.
 LCPI0_2:
 	.quad	-206158430256           ## 0xffffffcfffffffd0
 	.quad	-206158430256           ## 0xffffffcfffffffd0
 	.section	__TEXT,__text,regular,pure_instructions
 	.globl	_read_int
 	.align	4, 0x90
 ## Compiler-generated body of read_int with strip_ws, first_nondigit and read64
 ## inlined (see the %strip_ws.exit and %read64.exit* block labels).
 ## Register use visible below: %rdi = in, %rsi = out_end, %rdx = out_val
 ## (copied to %r8), %r9 = sign, %r10 = accumulated value, %ecx = index result
 ## of each vpcmpestri.
 _read_int:                              ## @read_int
 	.cfi_startproc
 ## BB#0:
 	pushq	%rbp
 Ltmp3:
 	.cfi_def_cfa_offset 16
 Ltmp4:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
 Ltmp5:
 	.cfi_def_cfa_register %rbp
 	pushq	%rbx
 Ltmp6:
 	.cfi_offset %rbx, -24
 	movq	%rdx, %r8
 ## strip_ws: load 16 bytes and look for the first non-whitespace byte.
 	vmovdqu	(%rdi), %xmm0
 	vmovdqa	LCPI0_0(%rip), %xmm1
 	movl	$2, %eax
 	movl	$16, %edx
 	vpcmpestri	$16, %xmm0, %xmm1
 	testl	%ecx, %ecx
 	jne	LBB0_1
 ## Sign handling: a leading '-' (45) reloads from in + 1 and sets sign = -1.
 LBB0_5:                                 ## %strip_ws.exit
 	vpextrb	$0, %xmm0, %eax
 	movl	$1, %r9d
 	cmpl	$45, %eax
 	jne	LBB0_7
 ## BB#6:
 	vmovdqu	1(%rdi), %xmm0
 	incq	%rdi
 	movq	$-1, %r9
 ## first_nondigit followed by the first read64 (digits 1-8).
 LBB0_7:
 	vmovdqa	LCPI0_1(%rip), %xmm1
 	movl	$2, %eax
 	movl	$16, %edx
 	vpcmpestri	$20, %xmm0, %xmm1
                                         ## kill: ECX<def> ECX<kill> RCX<def>
 	vpmovzxbd	%xmm0, %xmm3
 	vpsrldq	$4, %xmm0, %xmm2
 	vpmovzxbd	%xmm2, %xmm4
 	vmovdqa	LCPI0_2(%rip), %xmm2
 	vpaddq	%xmm2, %xmm3, %xmm3
 	vpaddq	%xmm2, %xmm4, %xmm4
 ## pow_ten_offset = (index < 8) ? 8 - index : 0, kept in %rax.
 	xorl	%eax, %eax
 	cmpl	$7, %ecx
 	ja	LBB0_9
 ## BB#8:
 	movl	$8, %eax
 	subl	%ecx, %eax
 	cltq
 LBB0_9:                                 ## %read64.exit13
 	leaq	_powers_of_ten_multipliers(%rip), %r11
 	vpmulld	(%r11,%rax,4), %xmm3, %xmm3
 	vpmulld	16(%r11,%rax,4), %xmm4, %xmm4
 	vphaddd	%xmm4, %xmm3, %xmm3
 	vphaddd	%xmm3, %xmm3, %xmm3
 	vphaddd	%xmm3, %xmm3, %xmm3
 	vmovd	%xmm3, %r10d
 	cmpl	$9, %ecx
 	jae	LBB0_10
 ## Common exit: *out_val = value * sign, *out_end = in + index.
 LBB0_16:
 	imulq	%r9, %r10
 	movq	%r10, (%r8)
 	movl	%ecx, %eax
 	addq	%rdi, %rax
 	movq	%rax, (%rsi)
 	popq	%rbx
 	popq	%rbp
 	retq
 ## strip_ws slow path: %rbx walks forward in 16-byte chunks. Note that %rdi
 ## is only updated on the LBB0_4 exit; leaving the loop through `je LBB0_5`
 ## keeps the original %rdi while %xmm0 holds the chunk at %rbx.
 LBB0_1:
 	movq	%rdi, %rbx
 LBB0_3:                                 ## %.lr.ph.i
                                         ## =>This Inner Loop Header: Depth=1
 	cmpl	$15, %ecx
 	jle	LBB0_4
 ## BB#2:                                ## %tailrecurse.i
                                         ##   in Loop: Header=BB0_3 Depth=1
 	vmovdqu	16(%rbx), %xmm0
 	addq	$16, %rbx
 	movl	$2, %eax
 	movl	$16, %edx
 	vpcmpestri	$16, %xmm0, %xmm1
 	testl	%ecx, %ecx
 	je	LBB0_5
 	jmp	LBB0_3
 ## index > 8: second read64 on the upper 8 bytes (digits 9-16).
 LBB0_10:
 	vpsrldq	$8, %xmm0, %xmm0
 	leal	-8(%rcx), %eax
 	vpmovzxbd	%xmm0, %xmm3
 	vpsrldq	$4, %xmm0, %xmm0
 	vpmovzxbd	%xmm0, %xmm4
 	vpaddq	%xmm2, %xmm3, %xmm0
 	vpaddq	%xmm2, %xmm4, %xmm3
 	xorl	%edx, %edx
 	cmpl	$7, %eax
 	ja	LBB0_12
 ## BB#11:
 	movl	$16, %edx
 	subl	%ecx, %edx
 	movslq	%edx, %rdx
 LBB0_12:                                ## %read64.exit8
 	vpmulld	(%r11,%rdx,4), %xmm0, %xmm0
 	vpmulld	16(%r11,%rdx,4), %xmm3, %xmm3
 	vphaddd	%xmm3, %xmm0, %xmm0
 	vphaddd	%xmm0, %xmm0, %xmm0
 	vphaddd	%xmm0, %xmm0, %xmm0
 	vmovd	%xmm0, %edx
 ## value = value * powers_of_ten[index - 8] + second group.
 	movl	%eax, %eax
 	leaq	_powers_of_ten(%rip), %rbx
 	movl	(%rbx,%rax,4), %eax
 	imulq	%rax, %r10
 	addq	%rdx, %r10
 	cmpl	$16, %ecx
 	jne	LBB0_16
 ## BB#13:
 ## index == 16: scan the next 16 bytes and run a third read64 on them.
 	vmovdqu	16(%rdi), %xmm0
 	movl	$2, %eax
 	movl	$16, %edx
 	vpcmpestri	$20, %xmm0, %xmm1
                                         ## kill: ECX<def> ECX<kill> RCX<def>
 	vpmovzxbd	%xmm0, %xmm1
 	vpsrldq	$4, %xmm0, %xmm0
 	vpmovzxbd	%xmm0, %xmm3
 	vpaddq	%xmm2, %xmm1, %xmm0
 	vpaddq	%xmm2, %xmm3, %xmm1
 	xorl	%eax, %eax
 	cmpl	$7, %ecx
 	ja	LBB0_15
 ## BB#14:
 	movl	$8, %eax
 	subl	%ecx, %eax
 	cltq
 LBB0_15:                                ## %read64.exit
 	addq	$16, %rdi
 	vpmulld	(%r11,%rax,4), %xmm0, %xmm0
 	vpmulld	16(%r11,%rax,4), %xmm1, %xmm1
 	vphaddd	%xmm1, %xmm0, %xmm0
 	vphaddd	%xmm0, %xmm0, %xmm0
 	vphaddd	%xmm0, %xmm0, %xmm0
 	vmovd	%xmm0, %eax
 ## powers_of_ten is indexed with the raw tail digit count in %ecx here.
 	movl	%ecx, %edx
 	movl	(%rbx,%rdx,4), %edx
 	imulq	%r10, %rdx
 	addq	%rax, %rdx
 	movq	%rdx, %r10
 	jmp	LBB0_16
 ## strip_ws exit with 0 < index < 16: in = chunk + index, reload from there.
 LBB0_4:
 	movslq	%ecx, %rax
 	leaq	(%rbx,%rax), %rdi
 	vmovdqu	(%rbx,%rax), %xmm0
 	jmp	LBB0_5
 	.cfi_endproc

 	.section	__TEXT,__const
 	.align	4                       ## @powers_of_ten
 ## 10^0 .. 10^8 as 32-bit values (the C array powers_of_ten, 9 entries).
 _powers_of_ten:
 	.long	1                       ## 0x1
 	.long	10                      ## 0xa
 	.long	100                     ## 0x64
 	.long	1000                    ## 0x3e8
 	.long	10000                   ## 0x2710
 	.long	100000                  ## 0x186a0
 	.long	1000000                 ## 0xf4240
 	.long	10000000                ## 0x989680
 	.long	100000000               ## 0x5f5e100

 	.align	4                       ## @powers_of_ten_multipliers
 ## 10^7 .. 10^0 followed by sixteen zeros (the C array
 ## powers_of_ten_multipliers); _read_int loads 16-byte windows from it.
 _powers_of_ten_multipliers:
 	.long	10000000                ## 0x989680
 	.long	1000000                 ## 0xf4240
 	.long	100000                  ## 0x186a0
 	.long	10000                   ## 0x2710
 	.long	1000                    ## 0x3e8
 	.long	100                     ## 0x64
 	.long	10                      ## 0xa
 	.long	1                       ## 0x1
 	.long	0                       ## 0x0
 	.long	0                       ## 0x0
 	.long	0                       ## 0x0
 	.long	0                       ## 0x0
 	.long	0                       ## 0x0
 	.long	0                       ## 0x0
 	.long	0                       ## 0x0
 	.long	0                       ## 0x0
 	.long	0                       ## 0x0
 	.long	0                       ## 0x0
 	.long	0                       ## 0x0
 	.long	0                       ## 0x0
 	.long	0                       ## 0x0
 	.long	0                       ## 0x0
 	.long	0                       ## 0x0
 	.long	0                       ## 0x0


 .subsections_via_symbols
	#include <x86intrin.h>
	#include <stdint.h>
	#include <stdio.h>


	// Per-digit place values used by read64(). read64() loads eight consecutive
	// entries starting at offset (8 - digit_count), so the last digit lines up
	// with the entry 1 and every lane past the digits is multiplied by 0.
	static const uint32_t powers_of_ten_multipliers[]
	= { 10000000, 1000000, 100000, 10000
	, 1000 , 100 , 10 , 1
	, 0 , 0 , 0 , 0
	, 0 , 0 , 0 , 0
	, 0 , 0 , 0 , 0
	, 0 , 0 , 0 , 0
	};

	// powers_of_ten[n] == 10^n for n in 0..8; read_int() indexes it with a digit
	// count to shift an already-parsed prefix left by that many decimal places.
	static const uint32_t powers_of_ten[]
	= { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000};


	// Skip leading spaces and tabs starting at `in`, scanning 16 bytes per step.
	//
	// in:  start of the text; every step performs an unaligned 16-byte load, so
	//      the bytes following the text must be readable.
	// out: receives the address of the first byte that is neither ' ' nor '\t'.
	//      It is written on every path, including when the whitespace run ends
	//      exactly on a 16-byte chunk boundary, so it always agrees with the
	//      returned vector.
	// Returns the 16 bytes loaded from `*out`.
	static __m128i strip_ws(const char* in, const char** out)
	{
	    // Set of two bytes to match (EQUAL_ANY); the remaining lanes are padding.
	    const __m128i white_set = _mm_setr_epi8(' ', '\t', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

	    for (;;) {
	        const __m128i m = _mm_loadu_si128((const __m128i*)in);

	        // Negative polarity: index of the first byte that is NOT in the set,
	        // or 16 when the whole chunk is whitespace.
	        const int index = _mm_cmpestri(white_set, 2, m, 16, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_NEGATIVE_POLARITY);

	        if (__builtin_expect(index == 0, 1)) {
	            *out = in;
	            return m;
	        }
	        if (index < 16) {
	            in += index;
	            *out = in;
	            return _mm_loadu_si128((const __m128i*)in);
	        }

	        // Whole chunk was whitespace: move on to the next 16 bytes.
	        in += 16;
	    }
	}

	// Return the offset of the first byte of `m` that is not an ASCII digit
	// ('0'..'9'); the result is 16 when all sixteen bytes are digits.
	static unsigned int first_nondigit(const __m128i m)
	{
	    // One inclusive range pair, '0' through '9'; the other lanes are unused.
	    const __m128i digit_range = _mm_setr_epi8('0', '9', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
	    const int index = _mm_cmpestri(digit_range, 2, m, 16, _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_NEGATIVE_POLARITY);
	    return (unsigned int)index;
	}


	// Convert the leading ASCII digits held in the low 8 bytes (64 bits) of `m`
	// to their decimal value.
	//
	// m:     text bytes; bytes 0 .. min(index, 8) - 1 are expected to be digits.
	// index: number of leading digits (as reported by first_nondigit). Values of
	//        8 or more use all eight bytes.
	// Returns the value of those digits (0 when index is 0). Lanes past the
	// digits are multiplied by 0, so their contents do not matter.
	static unsigned int read64(const __m128i m, unsigned int index)
	{
	    const __m128i ascii_zero = _mm_set1_epi32('0');

	    // Widen bytes 0-3 and 4-7 to 32-bit lanes and turn characters into digit
	    // values. The subtraction is done per 32-bit lane with _mm_sub_epi32:
	    // the `-` operator on __m128i is a compiler extension that works on
	    // 64-bit lanes, letting a borrow from one digit lane leak into the next.
	    const __m128i low_digits = _mm_sub_epi32(_mm_cvtepu8_epi32(m), ascii_zero);
	    const __m128i hi_digits = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_srli_si128(m, 4)), ascii_zero);

	    // Slide the multiplier window so that the last digit is multiplied by 1.
	    const int pow_ten_offset = (index < 8) ? 8 - index : 0;

	    const __m128i lo_muls = _mm_loadu_si128((const __m128i*)(powers_of_ten_multipliers + pow_ten_offset));
	    const __m128i hi_muls = _mm_loadu_si128((const __m128i*)(powers_of_ten_multipliers + pow_ten_offset + 4));

	    const __m128i lo_mulled = _mm_mullo_epi32(lo_muls, low_digits);
	    const __m128i hi_mulled = _mm_mullo_epi32(hi_muls, hi_digits);

	    // Three horizontal adds reduce the eight products to one sum in lane 0.
	    const __m128i sum1 = _mm_hadd_epi32(lo_mulled, hi_mulled);
	    const __m128i sum2 = _mm_hadd_epi32(sum1, sum1);
	    const __m128i sum3 = _mm_hadd_epi32(sum2, sum2);

	    return (unsigned int)_mm_extract_epi32(sum3, 0);
	}

	// Parse a decimal integer: optional leading spaces/tabs, an optional '-',
	// then a run of ASCII digits.
	//
	// in:      start of the text. The text is read with unaligned 16-byte loads,
	//          so the bytes following the number must be readable, and the digit
	//          run must be followed by a non-digit byte.
	// out_end: receives the address of the first byte after the digits.
	// out_val: receives the value; a leading '-' negates it in two's complement.
	//          There is no overflow detection: the result wraps modulo 2^64.
	void read_int(const char* in, const char** out_end, uint64_t* out_val)
	{
	    __m128i m = strip_ws(in, &in);

	    uint64_t sign = 1;
	    if (_mm_extract_epi8(m, 0) == '-') {
	        in++;
	        m = _mm_loadu_si128((const __m128i*)in);
	        sign = (uint64_t)-1;
	    }

	    unsigned int index = first_nondigit(m);

	    // First (up to) eight digits.
	    uint64_t int_out = read64(m, index);

	    if (__builtin_expect(index > 8, 0)) {
	        // Digits 9..16 live in the upper half of the same vector.
	        const __m128i m_shift = _mm_srli_si128(m, 8);
	        const uint64_t i2 = read64(m_shift, index - 8);
	        int_out = int_out * powers_of_ten[index - 8] + i2;

	        if (__builtin_expect(index == 16, 0)) {
	            // More than 16 digits: fold the remainder one digit at a time.
	            // This matches the table-based shift for short tails and avoids
	            // indexing powers_of_ten (9 entries) with a count above 8.
	            in += 16;
	            index = 0;
	            while (in[index] >= '0' && in[index] <= '9') {
	                int_out = int_out * 10 + (uint64_t)(in[index] - '0');
	                index++;
	            }
	        }
	    }

	    *out_val = int_out * sign;
	    *out_end = in + index;
	}
	.section __TEXT,__text,regular,pure_instructions
	.section __TEXT,__const
	.align 4
	## Constant pool for _read_int.
	## LCPI0_0: the byte set { ' ', '\t' } used by the whitespace scan, zero padded.
	LCPI0_0:
	.byte 32 ## 0x20
	.byte 9 ## 0x9
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	## LCPI0_1: the byte range '0'..'9' used by the digit scan, zero padded.
	LCPI0_1:
	.byte 48 ## 0x30
	.byte 57 ## 0x39
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	.byte 0 ## 0x0
	## LCPI0_2: each quad is -((48 << 32) | 48). The C expression `lows - zeros`
	## is emitted as a 64-bit-lane add (vpaddq) of this negated constant.
	LCPI0_2:
	.quad -206158430256 ## 0xffffffcfffffffd0
	.quad -206158430256 ## 0xffffffcfffffffd0
	.section __TEXT,__text,regular,pure_instructions
	.globl _read_int
	.align 4, 0x90
	## Compiler-generated body of read_int with strip_ws, first_nondigit and read64
	## inlined (see the %strip_ws.exit and %read64.exit* block labels).
	## Register use visible below: %rdi = in, %rsi = out_end, %rdx = out_val
	## (copied to %r8), %r9 = sign, %r10 = accumulated value, %ecx = index result
	## of each vpcmpestri.
	_read_int: ## @read_int
	.cfi_startproc
	## BB#0:
	pushq %rbp
	Ltmp3:
	.cfi_def_cfa_offset 16
	Ltmp4:
	.cfi_offset %rbp, -16
	movq %rsp, %rbp
	Ltmp5:
	.cfi_def_cfa_register %rbp
	pushq %rbx
	Ltmp6:
	.cfi_offset %rbx, -24
	movq %rdx, %r8
	## strip_ws: load 16 bytes and look for the first non-whitespace byte.
	vmovdqu (%rdi), %xmm0
	vmovdqa LCPI0_0(%rip), %xmm1
	movl $2, %eax
	movl $16, %edx
	vpcmpestri $16, %xmm0, %xmm1
	testl %ecx, %ecx
	jne LBB0_1
	## Sign handling: a leading '-' (45) reloads from in + 1 and sets sign = -1.
	LBB0_5: ## %strip_ws.exit
	vpextrb $0, %xmm0, %eax
	movl $1, %r9d
	cmpl $45, %eax
	jne LBB0_7
	## BB#6:
	vmovdqu 1(%rdi), %xmm0
	incq %rdi
	movq $-1, %r9
	## first_nondigit followed by the first read64 (digits 1-8).
	LBB0_7:
	vmovdqa LCPI0_1(%rip), %xmm1
	movl $2, %eax
	movl $16, %edx
	vpcmpestri $20, %xmm0, %xmm1
	## kill: ECX<def> ECX<kill> RCX<def>
	vpmovzxbd %xmm0, %xmm3
	vpsrldq $4, %xmm0, %xmm2
	vpmovzxbd %xmm2, %xmm4
	vmovdqa LCPI0_2(%rip), %xmm2
	vpaddq %xmm2, %xmm3, %xmm3
	vpaddq %xmm2, %xmm4, %xmm4
	## pow_ten_offset = (index < 8) ? 8 - index : 0, kept in %rax.
	xorl %eax, %eax
	cmpl $7, %ecx
	ja LBB0_9
	## BB#8:
	movl $8, %eax
	subl %ecx, %eax
	cltq
	LBB0_9: ## %read64.exit13
	leaq _powers_of_ten_multipliers(%rip), %r11
	vpmulld (%r11,%rax,4), %xmm3, %xmm3
	vpmulld 16(%r11,%rax,4), %xmm4, %xmm4
	vphaddd %xmm4, %xmm3, %xmm3
	vphaddd %xmm3, %xmm3, %xmm3
	vphaddd %xmm3, %xmm3, %xmm3
	vmovd %xmm3, %r10d
	cmpl $9, %ecx
	jae LBB0_10
	## Common exit: *out_val = value * sign, *out_end = in + index.
	LBB0_16:
	imulq %r9, %r10
	movq %r10, (%r8)
	movl %ecx, %eax
	addq %rdi, %rax
	movq %rax, (%rsi)
	popq %rbx
	popq %rbp
	retq
	## strip_ws slow path: %rbx walks forward in 16-byte chunks. Note that %rdi
	## is only updated on the LBB0_4 exit; leaving the loop through `je LBB0_5`
	## keeps the original %rdi while %xmm0 holds the chunk at %rbx.
	LBB0_1:
	movq %rdi, %rbx
	LBB0_3: ## %.lr.ph.i
	## =>This Inner Loop Header: Depth=1
	cmpl $15, %ecx
	jle LBB0_4
	## BB#2: ## %tailrecurse.i
	## in Loop: Header=BB0_3 Depth=1
	vmovdqu 16(%rbx), %xmm0
	addq $16, %rbx
	movl $2, %eax
	movl $16, %edx
	vpcmpestri $16, %xmm0, %xmm1
	testl %ecx, %ecx
	je LBB0_5
	jmp LBB0_3
	## index > 8: second read64 on the upper 8 bytes (digits 9-16).
	LBB0_10:
	vpsrldq $8, %xmm0, %xmm0
	leal -8(%rcx), %eax
	vpmovzxbd %xmm0, %xmm3
	vpsrldq $4, %xmm0, %xmm0
	vpmovzxbd %xmm0, %xmm4
	vpaddq %xmm2, %xmm3, %xmm0
	vpaddq %xmm2, %xmm4, %xmm3
	xorl %edx, %edx
	cmpl $7, %eax
	ja LBB0_12
	## BB#11:
	movl $16, %edx
	subl %ecx, %edx
	movslq %edx, %rdx
	LBB0_12: ## %read64.exit8
	vpmulld (%r11,%rdx,4), %xmm0, %xmm0
	vpmulld 16(%r11,%rdx,4), %xmm3, %xmm3
	vphaddd %xmm3, %xmm0, %xmm0
	vphaddd %xmm0, %xmm0, %xmm0
	vphaddd %xmm0, %xmm0, %xmm0
	vmovd %xmm0, %edx
	## value = value * powers_of_ten[index - 8] + second group.
	movl %eax, %eax
	leaq _powers_of_ten(%rip), %rbx
	movl (%rbx,%rax,4), %eax
	imulq %rax, %r10
	addq %rdx, %r10
	cmpl $16, %ecx
	jne LBB0_16
	## BB#13:
	## index == 16: scan the next 16 bytes and run a third read64 on them.
	vmovdqu 16(%rdi), %xmm0
	movl $2, %eax
	movl $16, %edx
	vpcmpestri $20, %xmm0, %xmm1
	## kill: ECX<def> ECX<kill> RCX<def>
	vpmovzxbd %xmm0, %xmm1
	vpsrldq $4, %xmm0, %xmm0
	vpmovzxbd %xmm0, %xmm3
	vpaddq %xmm2, %xmm1, %xmm0
	vpaddq %xmm2, %xmm3, %xmm1
	xorl %eax, %eax
	cmpl $7, %ecx
	ja LBB0_15
	## BB#14:
	movl $8, %eax
	subl %ecx, %eax
	cltq
	LBB0_15: ## %read64.exit
	addq $16, %rdi
	vpmulld (%r11,%rax,4), %xmm0, %xmm0
	vpmulld 16(%r11,%rax,4), %xmm1, %xmm1
	vphaddd %xmm1, %xmm0, %xmm0
	vphaddd %xmm0, %xmm0, %xmm0
	vphaddd %xmm0, %xmm0, %xmm0
	vmovd %xmm0, %eax
	## powers_of_ten is indexed with the raw tail digit count in %ecx here.
	movl %ecx, %edx
	movl (%rbx,%rdx,4), %edx
	imulq %r10, %rdx
	addq %rax, %rdx
	movq %rdx, %r10
	jmp LBB0_16
	## strip_ws exit with 0 < index < 16: in = chunk + index, reload from there.
	LBB0_4:
	movslq %ecx, %rax
	leaq (%rbx,%rax), %rdi
	vmovdqu (%rbx,%rax), %xmm0
	jmp LBB0_5
	.cfi_endproc

	.section __TEXT,__const
	.align 4 ## @powers_of_ten
	## 10^0 .. 10^8 as 32-bit values (the C array powers_of_ten, 9 entries).
	_powers_of_ten:
	.long 1 ## 0x1
	.long 10 ## 0xa
	.long 100 ## 0x64
	.long 1000 ## 0x3e8
	.long 10000 ## 0x2710
	.long 100000 ## 0x186a0
	.long 1000000 ## 0xf4240
	.long 10000000 ## 0x989680
	.long 100000000 ## 0x5f5e100

	.align 4 ## @powers_of_ten_multipliers
	## 10^7 .. 10^0 followed by sixteen zeros (the C array
	## powers_of_ten_multipliers); _read_int loads 16-byte windows from it.
	_powers_of_ten_multipliers:
	.long 10000000 ## 0x989680
	.long 1000000 ## 0xf4240
	.long 100000 ## 0x186a0
	.long 10000 ## 0x2710
	.long 1000 ## 0x3e8
	.long 100 ## 0x64
	.long 10 ## 0xa
	.long 1 ## 0x1
	.long 0 ## 0x0
	.long 0 ## 0x0
	.long 0 ## 0x0
	.long 0 ## 0x0
	.long 0 ## 0x0
	.long 0 ## 0x0
	.long 0 ## 0x0
	.long 0 ## 0x0
	.long 0 ## 0x0
	.long 0 ## 0x0
	.long 0 ## 0x0
	.long 0 ## 0x0
	.long 0 ## 0x0
	.long 0 ## 0x0
	.long 0 ## 0x0
	.long 0 ## 0x0


	.subsections_via_symbols