Last active
April 19, 2018 14:55
-
-
Save comzyh/5ba5ebeb0841661d1b006d3291404464 to your computer and use it in GitHub Desktop.
Bit Count test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# String literals for the ELF build of a.cpp (GAS, AT&T syntax).
# Each is referenced from main via a RIP-relative leaq.
.file "a.cpp" | |
.text | |
.section .rodata.str1.1,"aMS",@progbits,1 | |
# .LC10: source file name, passed in %rsi to __assert_fail.
.LC10: | |
.string "a.cpp" | |
.section .rodata.str1.8,"aMS",@progbits,1 | |
.align 8 | |
# .LC11: text of the failed assertion, passed in %rdi to __assert_fail.
.LC11: | |
.string "bits1(arr[i]) == bits2(arr[i])" | |
.section .rodata.str1.1 | |
# .LC12: banner printed with puts (no trailing newline stored here).
.LC12: | |
.string "start to run." | |
.section .rodata.str1.8 | |
.align 8 | |
# .LC14: per-epoch timing line, format argument of __printf_chk.
.LC14: | |
.string "bits1: %5lu, bits2: %5lu, builtin: %5lu\n" | |
.section .rodata.str1.1 | |
# .LC15: format for the final checksum print.
.LC15: | |
.string "%d\n" | |
# ---------------------------------------------------------------------------
# int main()  -- compiler output (see .ident: GCC 7.3.0), AT&T syntax, ELF.
#
# Register roles as used below:
#   %r14        address of table
#   %r12, %rbx  address of arr (reloaded at the top of every epoch)
#   %r15d       running checksum that is printed at the very end
#   %ymm2/3/5   per-lane masks 0x55555555 / 0x33333333 / 0x0f0f0f0f
#   %ymm7       per-lane mask 0x0000ffff (.LC8)
#   %ymm6       .LC9 (0x00ff per word) during table init, later .LC13
#               (0x00ff00ff per lane)
#   -308(%rbp)  epoch countdown, starts at 10
#   -80(%rbp)   clock() value taken before the first timed loop
#   -112(%rbp)  clock() value taken after the first timed loop
#
# Vector registers are caller-saved in the SysV ABI, so the mask constants
# are spilled to the frame before every call and reloaded afterwards; each
# call is also preceded by vzeroupper.
# ---------------------------------------------------------------------------
.section .text.startup,"ax",@progbits | |
.p2align 4,,15 | |
.globl main | |
.type main, @function | |
main: | |
.LFB86: | |
.cfi_startproc | |
# Prologue: realign %rsp to 32 bytes for the aligned ymm spills. %r10 keeps
# (entry %rsp + 8) so the epilogue can restore the caller's stack pointer;
# the return address is re-pushed below the aligned boundary.
leaq 8(%rsp), %r10 | |
.cfi_def_cfa 10, 0 | |
andq $-32, %rsp | |
pushq -8(%r10) | |
pushq %rbp | |
.cfi_escape 0x10,0x6,0x2,0x76,0 | |
movq %rsp, %rbp | |
pushq %r15 | |
pushq %r14 | |
pushq %r13 | |
pushq %r12 | |
.cfi_escape 0x10,0xf,0x2,0x76,0x78 | |
.cfi_escape 0x10,0xe,0x2,0x76,0x70 | |
.cfi_escape 0x10,0xd,0x2,0x76,0x68 | |
.cfi_escape 0x10,0xc,0x2,0x76,0x60 | |
leaq table(%rip), %r14 | |
pushq %r10 | |
.cfi_escape 0xf,0x3,0x76,0x58,0x6 | |
pushq %rbx | |
# %rax walks table in 32-byte steps, %rdx = table + 256 is the end marker.
leaq 256(%r14), %rdx | |
movq %r14, %rax | |
subq $288, %rsp | |
.cfi_escape 0x10,0x3,0x2,0x76,0x50 | |
# %ymm4 = {0..7} (current indices), %ymm11 = 32 (index step per iteration),
# %ymm10/%ymm9/%ymm8 = 8/16/24 (offsets of the other three index groups).
vmovdqa .LC0(%rip), %ymm4 | |
vmovdqa .LC1(%rip), %ymm11 | |
vmovdqa .LC2(%rip), %ymm10 | |
vmovdqa .LC3(%rip), %ymm9 | |
vmovdqa .LC4(%rip), %ymm8 | |
vmovdqa .LC5(%rip), %ymm2 | |
vmovdqa .LC6(%rip), %ymm3 | |
vmovdqa .LC7(%rip), %ymm5 | |
vmovdqa .LC8(%rip), %ymm7 | |
vmovdqa .LC9(%rip), %ymm6 | |
# .L2: table initialisation. Each iteration computes the 1-, 2- and 4-bit
# summation steps for 32 consecutive indices (four groups of eight dword
# lanes), narrows the results dword -> word -> byte, and stores 32 bytes.
.L2: | |
vpand %ymm2, %ymm4, %ymm13 | |
vpsrad $1, %ymm4, %ymm0 | |
vpaddd %ymm10, %ymm4, %ymm12 | |
vpand %ymm2, %ymm0, %ymm0 | |
vpaddd %ymm9, %ymm4, %ymm14 | |
vpaddd %ymm8, %ymm4, %ymm1 | |
addq $32, %rax | |
vpaddd %ymm11, %ymm4, %ymm4 | |
vpaddd %ymm13, %ymm0, %ymm0 | |
vpsrad $1, %ymm12, %ymm13 | |
vpand %ymm2, %ymm12, %ymm12 | |
vpand %ymm2, %ymm13, %ymm13 | |
vpaddd %ymm12, %ymm13, %ymm13 | |
vpsrad $1, %ymm14, %ymm12 | |
vpand %ymm2, %ymm14, %ymm14 | |
vpand %ymm2, %ymm12, %ymm12 | |
vpaddd %ymm14, %ymm12, %ymm12 | |
vpsrad $1, %ymm1, %ymm14 | |
vpand %ymm2, %ymm1, %ymm1 | |
vpand %ymm2, %ymm14, %ymm14 | |
vpaddd %ymm1, %ymm14, %ymm14 | |
vpsrad $2, %ymm0, %ymm1 | |
vpand %ymm3, %ymm0, %ymm0 | |
vpand %ymm3, %ymm1, %ymm1 | |
vpaddd %ymm0, %ymm1, %ymm1 | |
vpsrad $2, %ymm13, %ymm0 | |
vpand %ymm3, %ymm13, %ymm13 | |
vpand %ymm3, %ymm0, %ymm0 | |
vpaddd %ymm13, %ymm0, %ymm0 | |
vpsrad $2, %ymm12, %ymm13 | |
vpand %ymm3, %ymm12, %ymm12 | |
vpand %ymm3, %ymm13, %ymm13 | |
vpaddd %ymm12, %ymm13, %ymm13 | |
vpsrad $2, %ymm14, %ymm12 | |
vpand %ymm3, %ymm14, %ymm14 | |
vpand %ymm3, %ymm12, %ymm12 | |
vpaddd %ymm14, %ymm12, %ymm12 | |
vpsrad $4, %ymm1, %ymm14 | |
vpand %ymm5, %ymm1, %ymm1 | |
vpaddd %ymm1, %ymm14, %ymm1 | |
vpsrad $4, %ymm0, %ymm14 | |
vpand %ymm5, %ymm0, %ymm0 | |
vpaddd %ymm0, %ymm14, %ymm14 | |
vpand %ymm1, %ymm7, %ymm0 | |
vpand %ymm14, %ymm7, %ymm1 | |
vpackusdw %ymm1, %ymm0, %ymm0 | |
vpsrad $4, %ymm13, %ymm1 | |
vpand %ymm5, %ymm13, %ymm13 | |
vpermq $216, %ymm0, %ymm0 | |
vpaddd %ymm13, %ymm1, %ymm1 | |
vpsrad $4, %ymm12, %ymm13 | |
vpand %ymm5, %ymm12, %ymm12 | |
vpand %ymm1, %ymm7, %ymm1 | |
vpand %ymm0, %ymm6, %ymm0 | |
vpaddd %ymm12, %ymm13, %ymm12 | |
vpand %ymm12, %ymm7, %ymm12 | |
vpackusdw %ymm12, %ymm1, %ymm1 | |
vpermq $216, %ymm1, %ymm1 | |
vpand %ymm1, %ymm6, %ymm1 | |
vpackuswb %ymm1, %ymm0, %ymm0 | |
vpermq $216, %ymm0, %ymm0 | |
vmovdqa %ymm0, -32(%rax) | |
cmpq %rax, %rdx | |
jne .L2 | |
# srand(11). Spill the four masks that are still needed afterwards.
movl $11, %edi | |
vmovdqa %ymm7, -176(%rbp) | |
vmovdqa %ymm5, -144(%rbp) | |
vmovdqa %ymm3, -112(%rbp) | |
vmovdqa %ymm2, -80(%rbp) | |
vzeroupper | |
leaq arr(%rip), %r12 | |
call srand@PLT | |
vmovdqa -176(%rbp), %ymm7 | |
# %rcx = i (starts at 0). %esi = 1321528399: multiply-by-reciprocal constant
# for i / 13 (1321528399 * 13 = 2^34 + 3; the quotient is the high half of
# the product shifted right by 2).
xorl %ecx, %ecx | |
movl $1321528399, %esi | |
vmovdqa -144(%rbp), %ymm5 | |
vmovdqa -112(%rbp), %ymm3 | |
vmovdqa -80(%rbp), %ymm2 | |
.p2align 4,,10 | |
.p2align 3 | |
# .L4: fill loop. arr[i] = (833 + i) << (i % 13), then compute the bit count
# both arithmetically (%edx) and via four table lookups (%eax) and branch to
# the assert-failure stub .L17 if they differ.
.L4: | |
movl %ecx, %eax | |
leal 833(%rcx), %edi | |
movl %ecx, %ebx | |
mull %esi | |
shrl $2, %edx | |
# %eax = 13 * (i / 13), so %ebx becomes i % 13.
leal (%rdx,%rdx,2), %eax | |
leal (%rdx,%rax,4), %eax | |
subl %eax, %ebx | |
shlx %ebx, %edi, %eax | |
movl %eax, %edx | |
movl %eax, %edi | |
movl %eax, (%r12,%rcx,4) | |
# Arithmetic bit count of %eax, accumulated in %edx.
sarl %edx | |
andl $1431655765, %edi | |
andl $1431655765, %edx | |
addl %edi, %edx | |
movl %edx, %edi | |
sarl $2, %edx | |
andl $858993459, %edi | |
andl $858993459, %edx | |
addl %edi, %edx | |
movl %edx, %edi | |
sarl $4, %edx | |
andl $252645135, %edi | |
andl $252645135, %edx | |
addl %edi, %edx | |
movl %edx, %edi | |
sarl $8, %edx | |
andl $16711935, %edi | |
andl $16711935, %edx | |
addl %edi, %edx | |
# Table-based bit count: one sign-extended byte load per byte of the value.
movzbl %ah, %edi | |
movsbl (%r14,%rdi), %r10d | |
movzbl %al, %edi | |
movzwl %dx, %r8d | |
movsbl (%r14,%rdi), %edi | |
shrl $16, %edx | |
addl %r8d, %edx | |
addl %r10d, %edi | |
movl %eax, %r10d | |
shrl $24, %eax | |
sarl $16, %r10d | |
movsbl (%r14,%rax), %eax | |
movzbl %r10b, %r10d | |
movsbl (%r14,%r10), %r10d | |
addl %r10d, %eax | |
addl %edi, %eax | |
cmpl %edx, %eax | |
jne .L17 | |
addq $1, %rcx | |
# 104857600 = number of 4-byte elements in arr (.size arr is 419430400).
cmpq $104857600, %rcx | |
jne .L4 | |
# puts("start to run."), then set up the epoch loop: checksum %r15d = 0,
# %ymm6 = .LC13, countdown at -308(%rbp) = 10.
leaq .LC12(%rip), %rdi | |
vmovdqa %ymm7, -176(%rbp) | |
vmovdqa %ymm5, -144(%rbp) | |
vmovdqa %ymm3, -112(%rbp) | |
vmovdqa %ymm2, -80(%rbp) | |
vzeroupper | |
xorl %r15d, %r15d | |
call puts@PLT | |
vmovdqa .LC13(%rip), %ymm6 | |
vmovdqa -176(%rbp), %ymm7 | |
movl $10, -308(%rbp) | |
vmovdqa -144(%rbp), %ymm5 | |
vmovdqa -112(%rbp), %ymm3 | |
vmovdqa -80(%rbp), %ymm2 | |
.p2align 4,,10 | |
.p2align 3 | |
# .L8: one epoch = three timed passes over arr separated by clock() calls.
.L8: | |
vmovdqa %ymm6, -240(%rbp) | |
vmovdqa %ymm7, -208(%rbp) | |
vmovdqa %ymm5, -176(%rbp) | |
vmovdqa %ymm3, -144(%rbp) | |
vmovdqa %ymm2, -112(%rbp) | |
vzeroupper | |
leaq arr(%rip), %rbx | |
call clock@PLT | |
# %ymm4 = eight per-lane partial sums for pass 1, cleared here.
vpxor %xmm4, %xmm4, %xmm4 | |
vmovdqa -240(%rbp), %ymm6 | |
vmovdqa -208(%rbp), %ymm7 | |
movq %rax, -80(%rbp) | |
movq %rbx, %r13 | |
vmovdqa -176(%rbp), %ymm5 | |
# %rax = arr + 419430400 (one past the end); %r13 is the read cursor.
leaq 419430400(%r12), %rax | |
movq %rbx, %r12 | |
vmovdqa -144(%rbp), %ymm3 | |
vmovdqa -112(%rbp), %ymm2 | |
.p2align 4,,10 | |
.p2align 3 | |
# .L5: pass 1, arithmetic bit count on eight dwords per iteration. The
# cmpq sets the flags early; the vector instructions in between do not
# modify them, so the jne at the bottom still tests that comparison.
.L5: | |
vmovdqa 0(%r13), %ymm0 | |
addq $32, %r13 | |
cmpq %rax, %r13 | |
vpsrad $1, %ymm0, %ymm1 | |
vpand %ymm2, %ymm0, %ymm0 | |
vpand %ymm2, %ymm1, %ymm1 | |
vpaddd %ymm0, %ymm1, %ymm1 | |
vpsrad $2, %ymm1, %ymm0 | |
vpand %ymm3, %ymm1, %ymm1 | |
vpand %ymm3, %ymm0, %ymm0 | |
vpaddd %ymm1, %ymm0, %ymm0 | |
vpsrad $4, %ymm0, %ymm1 | |
vpand %ymm5, %ymm0, %ymm0 | |
vpand %ymm5, %ymm1, %ymm1 | |
vpaddd %ymm0, %ymm1, %ymm1 | |
vpsrad $8, %ymm1, %ymm0 | |
vpand %ymm6, %ymm1, %ymm1 | |
vpand %ymm6, %ymm0, %ymm0 | |
vpaddd %ymm1, %ymm0, %ymm0 | |
vpsrld $16, %ymm0, %ymm1 | |
vpand %ymm7, %ymm0, %ymm0 | |
vpaddd %ymm0, %ymm1, %ymm0 | |
vpaddd %ymm0, %ymm4, %ymm4 | |
jne .L5 | |
# Horizontal add of the eight lanes of %ymm4 into %eax, folded into the
# checksum. %ymm7 is spilled first because it is reused as a zero vector.
vmovdqa %ymm7, -240(%rbp) | |
vpxor %xmm7, %xmm7, %xmm7 | |
vmovdqa %ymm6, -272(%rbp) | |
vmovdqa %ymm5, -208(%rbp) | |
vperm2i128 $33, %ymm7, %ymm4, %ymm0 | |
vmovdqa %ymm3, -176(%rbp) | |
vmovdqa %ymm2, -144(%rbp) | |
vpaddd %ymm4, %ymm0, %ymm0 | |
vperm2i128 $33, %ymm7, %ymm0, %ymm1 | |
vpalignr $8, %ymm0, %ymm1, %ymm1 | |
vpaddd %ymm1, %ymm0, %ymm0 | |
vperm2i128 $33, %ymm7, %ymm0, %ymm1 | |
vpalignr $4, %ymm0, %ymm1, %ymm1 | |
vpaddd %ymm1, %ymm0, %ymm0 | |
vmovd %xmm0, %eax | |
addl %eax, %r15d | |
vzeroupper | |
call clock@PLT | |
leaq arr(%rip), %rdx | |
vmovdqa -272(%rbp), %ymm6 | |
movq %rax, -112(%rbp) | |
vmovdqa -240(%rbp), %ymm7 | |
vmovdqa -208(%rbp), %ymm5 | |
vmovdqa -176(%rbp), %ymm3 | |
vmovdqa -144(%rbp), %ymm2 | |
.p2align 4,,10 | |
.p2align 3 | |
# .L6: pass 2, table-lookup bit count, one dword per iteration. %rdx is the
# cursor; %r13 still holds the end address left by pass 1.
.L6: | |
movl (%rdx), %eax | |
addq $4, %rdx | |
movzbl %ah, %edi | |
movsbl (%r14,%rdi), %r8d | |
movzbl %al, %edi | |
movsbl (%r14,%rdi), %edi | |
addl %r8d, %edi | |
movl %eax, %r8d | |
shrl $24, %eax | |
sarl $16, %r8d | |
movsbl (%r14,%rax), %eax | |
movzbl %r8b, %r8d | |
movsbl (%r14,%r8), %r8d | |
addl %r8d, %eax | |
addl %edi, %eax | |
addl %eax, %r15d | |
cmpq %rdx, %r13 | |
jne .L6 | |
# %rdx (now the end address) is saved across clock() for use by pass 3.
vmovdqa %ymm6, -304(%rbp) | |
movq %rdx, -144(%rbp) | |
vmovdqa %ymm7, -272(%rbp) | |
vmovdqa %ymm5, -240(%rbp) | |
vmovdqa %ymm3, -208(%rbp) | |
vmovdqa %ymm2, -176(%rbp) | |
vzeroupper | |
call clock@PLT | |
movq -144(%rbp), %rdx | |
# %r13 = clock() value taken after pass 2.
movq %rax, %r13 | |
vmovdqa -304(%rbp), %ymm6 | |
vmovdqa -272(%rbp), %ymm7 | |
vmovdqa -240(%rbp), %ymm5 | |
vmovdqa -208(%rbp), %ymm3 | |
vmovdqa -176(%rbp), %ymm2 | |
.p2align 4,,10 | |
.p2align 3 | |
# .L7: pass 3, hardware popcnt, one dword per iteration (%rbx cursor).
.L7: | |
xorl %eax, %eax | |
popcntl (%rbx), %eax | |
addq $4, %rbx | |
addl %eax, %r15d | |
cmpq %rbx, %rdx | |
jne .L7 | |
vmovdqa %ymm6, -272(%rbp) | |
vmovdqa %ymm7, -240(%rbp) | |
vmovdqa %ymm5, -208(%rbp) | |
vmovdqa %ymm3, -176(%rbp) | |
vmovdqa %ymm2, -144(%rbp) | |
vzeroupper | |
call clock@PLT | |
# __printf_chk(1, .LC14, %rdx, %rcx, %r8) with the three clock() deltas:
#   %rdx = (after pass 1) - (before pass 1)
#   %rcx = (after pass 2) - (after pass 1)
#   %r8  = (after pass 3) - (after pass 2)
# %eax = 0: no vector-register arguments for this variadic call.
movq -112(%rbp), %rdx | |
movq %r13, %rcx | |
subq %r13, %rax | |
leaq .LC14(%rip), %rsi | |
movq %rax, %r8 | |
movl $1, %edi | |
xorl %eax, %eax | |
subq %rdx, %rcx | |
subq -80(%rbp), %rdx | |
call __printf_chk@PLT | |
# Decrement the epoch countdown; the vmovdqa reloads below leave the flags
# untouched, so the jne tests the result of this subl.
subl $1, -308(%rbp) | |
vmovdqa -144(%rbp), %ymm2 | |
vmovdqa -176(%rbp), %ymm3 | |
vmovdqa -208(%rbp), %ymm5 | |
vmovdqa -240(%rbp), %ymm7 | |
vmovdqa -272(%rbp), %ymm6 | |
jne .L8 | |
# __printf_chk(1, "%d\n", checksum), then return 0.
leaq .LC15(%rip), %rsi | |
movl %r15d, %edx | |
movl $1, %edi | |
xorl %eax, %eax | |
vzeroupper | |
call __printf_chk@PLT | |
# Epilogue: pop in the reverse order of the prologue pushes and restore the
# caller's %rsp from the saved %r10.
addq $288, %rsp | |
xorl %eax, %eax | |
popq %rbx | |
popq %r10 | |
.cfi_remember_state | |
.cfi_def_cfa 10, 0 | |
popq %r12 | |
popq %r13 | |
popq %r14 | |
popq %r15 | |
popq %rbp | |
leaq -8(%r10), %rsp | |
.cfi_def_cfa 7, 8 | |
ret | |
# .L17: assertion failure stub, reached from the fill loop.
# __assert_fail(.LC11 expression, .LC10 file, 32 line, function name).
# No code follows the call.
.L17: | |
.cfi_restore_state | |
leaq _ZZ4mainE19__PRETTY_FUNCTION__(%rip), %rcx | |
leaq .LC10(%rip), %rsi | |
leaq .LC11(%rip), %rdi | |
movl $32, %edx | |
vzeroupper | |
call __assert_fail@PLT | |
.cfi_endproc | |
.LFE86: | |
.size main, .-main | |
# Function-name string handed to __assert_fail: "int main()" plus the
# terminating NUL is 11 bytes, matching the .size directive.
.section .rodata | |
.align 8 | |
.type _ZZ4mainE19__PRETTY_FUNCTION__, @object | |
.size _ZZ4mainE19__PRETTY_FUNCTION__, 11 | |
_ZZ4mainE19__PRETTY_FUNCTION__: | |
.string "int main()" | |
# arr: 419430400 zero-initialised bytes in .bss, 32-byte aligned so main can
# read it with aligned 32-byte vector loads (vmovdqa).
.globl arr | |
.bss | |
.align 32 | |
.type arr, @object | |
.size arr, 419430400 | |
arr: | |
.zero 419430400 | |
# table: 256-byte lookup table, filled by main 32 bytes at a time with
# aligned vector stores.
.globl table | |
.align 32 | |
.type table, @object | |
.size table, 256 | |
table: | |
.zero 256 | |
# 32-byte vector constants loaded by main with vmovdqa (hence .align 32).
.section .rodata.cst32,"aM",@progbits,32 | |
.align 32 | |
# .LC0: initial lane indices 0..7 for the table-initialisation loop.
.LC0: | |
.long 0 | |
.long 1 | |
.long 2 | |
.long 3 | |
.long 4 | |
.long 5 | |
.long 6 | |
.long 7 | |
.align 32 | |
# .LC1: per-iteration index step (32 table entries per iteration).
.LC1: | |
.long 32 | |
.long 32 | |
.long 32 | |
.long 32 | |
.long 32 | |
.long 32 | |
.long 32 | |
.long 32 | |
.align 32 | |
# .LC2 / .LC3 / .LC4: offsets +8, +16, +24 that derive the second, third
# and fourth group of eight indices from the first.
.LC2: | |
.long 8 | |
.long 8 | |
.long 8 | |
.long 8 | |
.long 8 | |
.long 8 | |
.long 8 | |
.long 8 | |
.align 32 | |
.LC3: | |
.long 16 | |
.long 16 | |
.long 16 | |
.long 16 | |
.long 16 | |
.long 16 | |
.long 16 | |
.long 16 | |
.align 32 | |
.LC4: | |
.long 24 | |
.long 24 | |
.long 24 | |
.long 24 | |
.long 24 | |
.long 24 | |
.long 24 | |
.long 24 | |
.align 32 | |
# .LC5: 1431655765 = 0x55555555, mask for the 1-bit summation step.
.LC5: | |
.long 1431655765 | |
.long 1431655765 | |
.long 1431655765 | |
.long 1431655765 | |
.long 1431655765 | |
.long 1431655765 | |
.long 1431655765 | |
.long 1431655765 | |
.align 32 | |
# .LC6: 858993459 = 0x33333333, mask for the 2-bit summation step.
.LC6: | |
.long 858993459 | |
.long 858993459 | |
.long 858993459 | |
.long 858993459 | |
.long 858993459 | |
.long 858993459 | |
.long 858993459 | |
.long 858993459 | |
.align 32 | |
# .LC7: 252645135 = 0x0f0f0f0f, mask for the 4-bit summation step.
.LC7: | |
.long 252645135 | |
.long 252645135 | |
.long 252645135 | |
.long 252645135 | |
.long 252645135 | |
.long 252645135 | |
.long 252645135 | |
.long 252645135 | |
.align 32 | |
# .LC8: 65535 = 0x0000ffff, low-halfword mask (final step and dword->word
# narrowing).
.LC8: | |
.long 65535 | |
.long 65535 | |
.long 65535 | |
.long 65535 | |
.long 65535 | |
.long 65535 | |
.long 65535 | |
.long 65535 | |
.align 32 | |
# .LC9: sixteen 16-bit words of 0x00ff, applied before the word->byte pack
# in the table-initialisation loop.
.LC9: | |
.value 255 | |
.value 255 | |
.value 255 | |
.value 255 | |
.value 255 | |
.value 255 | |
.value 255 | |
.value 255 | |
.value 255 | |
.value 255 | |
.value 255 | |
.value 255 | |
.value 255 | |
.value 255 | |
.value 255 | |
.value 255 | |
.align 32 | |
# .LC13: 16711935 = 0x00ff00ff, mask for the 8-bit summation step.
.LC13: | |
.long 16711935 | |
.long 16711935 | |
.long 16711935 | |
.long 16711935 | |
.long 16711935 | |
.long 16711935 | |
.long 16711935 | |
.long 16711935 | |
.ident "GCC: (Ubuntu 7.3.0-15ubuntu2) 7.3.0" | |
.section .note.GNU-stack,"",@progbits |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdio> | |
#include <cstring> | |
#include <time.h> | |
#include <cassert> | |
#include <cstdlib> | |
// table[b] = number of set bits in the byte value b; filled in main() from
// bits1() and read by bits2().
char table[256]; | |
// Benchmark input: 100 << 20 = 104857600 ints, filled in main().
int arr[100 << 20]; | |
// Counts the set bits of x by pairwise summation: adjacent 1-bit fields are
// added into 2-bit fields, those into 4-bit fields, and so on up to one
// 32-bit total.
//
// The arithmetic is done on an unsigned copy of x. With plain int, the
// first step computes (x & 0x55555555) + ((x >> 1) & 0x55555555), which
// exceeds INT_MAX whenever bits 30 and 31 are both set (e.g. x == -1):
// signed overflow, i.e. undefined behaviour. It also right-shifts negative
// values. Unsigned arithmetic has neither problem and yields the same count.
//
// Returns a value in [0, 32].
inline int bits1(int x) {
    unsigned int v = static_cast<unsigned int>(x);
    v = (v & 0x55555555u) + ((v >> 1)  & 0x55555555u);  // 16 x 2-bit sums
    v = (v & 0x33333333u) + ((v >> 2)  & 0x33333333u);  //  8 x 4-bit sums
    v = (v & 0x0f0f0f0fu) + ((v >> 4)  & 0x0f0f0f0fu);  //  4 x 8-bit sums
    v = (v & 0x00ff00ffu) + ((v >> 8)  & 0x00ff00ffu);  //  2 x 16-bit sums
    v = (v & 0x0000ffffu) + ((v >> 16) & 0x0000ffffu);  //  final total
    return static_cast<int>(v);
}
// Counts the set bits of x with four lookups in the global byte table:
// one per byte of the low 16 bits, then one per byte of x >> 16.
// The caller must have filled `table` beforehand.
inline int bits2(int x) {
    const int upper = x >> 16;
    const int lowCount  = table[x & 0xff] + table[(x & 0xff00) >> 8];
    const int highCount = table[upper & 0xff] + table[(upper & 0xff00) >> 8];
    return lowCount + highCount;
}
int main() | |
{ | |
for (int i = 0; i < 256; i++) { | |
table[i] = bits1(i); | |
} | |
srand (11); | |
for (int i = 0; i < sizeof(arr) / sizeof(int); i++) { | |
arr[i] = (0x341 + i) << (i % 13); | |
assert(bits1(arr[i]) == bits2(arr[i])); | |
} | |
printf("start to run.\n"); | |
int nonsense = 0; | |
const int N = sizeof(arr) / sizeof(int); | |
for (int epoch = 0; epoch < 10; epoch ++) { | |
clock_t start = clock(); | |
for (int i = 0; i < N; i++) | |
nonsense += bits1(arr[i]); | |
clock_t t1 = clock(); | |
for (int i = 0; i < N; i++) | |
nonsense += bits2(arr[i]); | |
clock_t t2 = clock(); | |
for (int i = 0; i < N; i++) | |
nonsense += __builtin_popcount(arr[i]); | |
clock_t t3 = clock(); | |
printf("bits1: %5lu, bits2: %5lu, builtin: %5lu\n", t1 - start, t2 - t1, t3 - t2); | |
} | |
printf("%d\n", nonsense); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# MinGW-w64 (Win64, AT&T syntax) build of A.cpp.
.file "A.cpp" | |
.section .rdata,"dr" | |
.align 8 | |
# .LC0: format string of the per-epoch timing line.
.LC0: | |
.ascii "bits1: %5lu, bits2: %5lu, builtin: %5lu\12\0" | |
.text | |
.p2align 4,,15 | |
# _Z6printfPKcz.constprop.0 -- file-local (.scl 3) clone of printf whose
# format argument is fixed to .LC0; whatever the caller put in %rcx is
# overwritten.
# In:  %rdx, %r8, %r9 = the three variadic arguments.
# It stores them into the caller-provided home slots at 72/80/88(%rsp)
# (the 56-byte frame plus the 8-byte return address puts the %rcx slot at
# 64(%rsp)), saves the address of the first slot at 40(%rsp), and calls
# __mingw_vprintf(%rcx = .LC0, %rdx = address of the first variadic slot).
# Out: whatever __mingw_vprintf leaves in %rax.
.def _Z6printfPKcz.constprop.0; .scl 3; .type 32; .endef | |
.seh_proc _Z6printfPKcz.constprop.0 | |
_Z6printfPKcz.constprop.0: | |
.LFB91: | |
subq $56, %rsp | |
.seh_stackalloc 56 | |
.seh_endprologue | |
leaq .LC0(%rip), %rcx | |
movq %rdx, 72(%rsp) | |
leaq 72(%rsp), %rdx | |
movq %r8, 80(%rsp) | |
movq %r9, 88(%rsp) | |
movq %rdx, 40(%rsp) | |
call __mingw_vprintf | |
addq $56, %rsp | |
ret | |
.seh_endproc | |
# _Z6printfPKcz -- printf(const char *, ...) emitted as a link-once
# (discardable duplicate) function.
# In:  %rcx = format string; %rdx, %r8, %r9 = first three variadic args.
# It stores the register arguments into the caller-provided home slots at
# 72/80/88(%rsp), saves the address of the first slot at 40(%rsp), and
# calls __mingw_vprintf(%rcx = format, %rdx = address of the first slot).
# Out: whatever __mingw_vprintf leaves in %rax.
.section .text$_Z6printfPKcz,"x" | |
.linkonce discard | |
.p2align 4,,15 | |
.globl _Z6printfPKcz | |
.def _Z6printfPKcz; .scl 2; .type 32; .endef | |
.seh_proc _Z6printfPKcz | |
_Z6printfPKcz: | |
.LFB8: | |
subq $56, %rsp | |
.seh_stackalloc 56 | |
.seh_endprologue | |
movq %rdx, 72(%rsp) | |
leaq 72(%rsp), %rdx | |
movq %r8, 80(%rsp) | |
movq %r9, 88(%rsp) | |
movq %rdx, 40(%rsp) | |
call __mingw_vprintf | |
addq $56, %rsp | |
ret | |
.seh_endproc | |
# String literals used by main in the Win64 build.
# The file-name string must be labelled .LC10: main's assertion path loads
# it with `leaq .LC10(%rip), %rdx`. It was previously labelled .col0, which
# nothing references, leaving .LC10 undefined.
.def __main; .scl 2; .type 32; .endef
.section .rdata,"dr"
# .LC10: source file name passed to _assert.
.LC10:
.ascii "A.cpp\0"
.align 8
# .LC11: text of the failed assertion passed to _assert.
.LC11:
.ascii "bits1(arr[i]) == bits2(arr[i])\0"
# .LC12: start banner.
.LC12:
.ascii "start to run.\12\0"
# .LC15: format for the final checksum print.
.LC15:
.ascii "%d\12\0"
# ---------------------------------------------------------------------------
# int main()  -- compiler output (see .ident: MinGW-w64 GCC 5.3.0), Win64,
# AT&T syntax, SSE2 code.
#
# Register roles as used below:
#   %r12        address of table
#   %rbx, %r14  address of arr
#   %r15        arr + 419430400 (end marker inside the epoch loop)
#   %r13d       running checksum
#   %esi        division constant in the fill loop, later epoch countdown
#   %xmm6/7/8   masks 0x55555555 / 0x33333333 / 0x0f0f0f0f
#   %xmm9/10    .LC3/.LC2 during table init, later .LC13/.LC14
#               (0x00ff00ff / 0x0000ffff)
#
# %xmm6-%xmm14 are callee-saved in the Microsoft x64 convention, so they are
# saved in the prologue and restored in the epilogue; in return the masks
# kept in %xmm6-%xmm10 survive the calls without any spilling.
# ---------------------------------------------------------------------------
.section .text.startup,"x" | |
.p2align 4,,15 | |
.globl main | |
.def main; .scl 2; .type 32; .endef | |
.seh_proc main | |
main: | |
.LFB90: | |
# Prologue: 8 pushes + 200 bytes + return address = 272 bytes, keeping
# %rsp 16-byte aligned for the movaps saves and for the calls.
pushq %r15 | |
.seh_pushreg %r15 | |
pushq %r14 | |
.seh_pushreg %r14 | |
pushq %r13 | |
.seh_pushreg %r13 | |
pushq %r12 | |
.seh_pushreg %r12 | |
pushq %rbp | |
.seh_pushreg %rbp | |
pushq %rdi | |
.seh_pushreg %rdi | |
pushq %rsi | |
.seh_pushreg %rsi | |
pushq %rbx | |
.seh_pushreg %rbx | |
subq $200, %rsp | |
.seh_stackalloc 200 | |
movaps %xmm6, 48(%rsp) | |
.seh_savexmm %xmm6, 48 | |
movaps %xmm7, 64(%rsp) | |
.seh_savexmm %xmm7, 64 | |
movaps %xmm8, 80(%rsp) | |
.seh_savexmm %xmm8, 80 | |
movaps %xmm9, 96(%rsp) | |
.seh_savexmm %xmm9, 96 | |
movaps %xmm10, 112(%rsp) | |
.seh_savexmm %xmm10, 112 | |
movaps %xmm11, 128(%rsp) | |
.seh_savexmm %xmm11, 128 | |
movaps %xmm12, 144(%rsp) | |
.seh_savexmm %xmm12, 144 | |
movaps %xmm13, 160(%rsp) | |
.seh_savexmm %xmm13, 160 | |
movaps %xmm14, 176(%rsp) | |
.seh_savexmm %xmm14, 176 | |
.seh_endprologue | |
leaq table(%rip), %r12 | |
call __main | |
# %rax walks table in 16-byte steps, %rdx = table + 256 is the end marker.
# %xmm2 = {0,1,2,3} (current indices), %xmm10 = 16 (index step),
# %xmm9/%xmm5/%xmm4 = 4/8/12 (offsets of the other three index groups).
leaq 256+table(%rip), %rdx | |
movq %r12, %rax | |
movdqa .LC1(%rip), %xmm2 | |
movdqa .LC2(%rip), %xmm10 | |
movdqa .LC3(%rip), %xmm9 | |
movdqa .LC4(%rip), %xmm5 | |
movdqa .LC5(%rip), %xmm4 | |
movdqa .LC6(%rip), %xmm6 | |
movdqa .LC7(%rip), %xmm7 | |
movdqa .LC8(%rip), %xmm8 | |
movdqa .LC9(%rip), %xmm3 | |
# .L4: table initialisation. Each iteration computes the 1-, 2- and 4-bit
# summation steps for 16 consecutive indices (four groups of four dword
# lanes), narrows the results to bytes, and stores 16 bytes.
.L4: | |
movdqa %xmm2, %xmm12 | |
movdqa %xmm2, %xmm1 | |
movdqa %xmm2, %xmm0 | |
psrad $1, %xmm12 | |
paddd %xmm9, %xmm1 | |
paddd %xmm5, %xmm0 | |
pand %xmm6, %xmm12 | |
movdqa %xmm12, %xmm11 | |
movdqa %xmm2, %xmm12 | |
movdqa %xmm2, %xmm13 | |
pand %xmm6, %xmm12 | |
paddd %xmm11, %xmm12 | |
movdqa %xmm1, %xmm11 | |
pand %xmm6, %xmm1 | |
psrad $1, %xmm11 | |
paddd %xmm4, %xmm13 | |
movdqa %xmm12, %xmm14 | |
pand %xmm7, %xmm12 | |
pand %xmm6, %xmm11 | |
paddd %xmm1, %xmm11 | |
movdqa %xmm0, %xmm1 | |
pand %xmm6, %xmm0 | |
psrad $1, %xmm1 | |
addq $16, %rax | |
paddd %xmm10, %xmm2 | |
pand %xmm6, %xmm1 | |
paddd %xmm0, %xmm1 | |
movdqa %xmm13, %xmm0 | |
pand %xmm6, %xmm13 | |
psrad $1, %xmm0 | |
pand %xmm6, %xmm0 | |
paddd %xmm13, %xmm0 | |
movdqa %xmm11, %xmm13 | |
pand %xmm7, %xmm11 | |
psrad $2, %xmm13 | |
psrad $2, %xmm14 | |
pand %xmm7, %xmm13 | |
paddd %xmm11, %xmm13 | |
movdqa %xmm1, %xmm11 | |
pand %xmm7, %xmm14 | |
psrad $2, %xmm11 | |
paddd %xmm12, %xmm14 | |
movdqa %xmm0, %xmm12 | |
pand %xmm7, %xmm11 | |
pand %xmm7, %xmm1 | |
psrad $2, %xmm12 | |
pand %xmm7, %xmm0 | |
paddd %xmm11, %xmm1 | |
movdqa %xmm14, %xmm11 | |
pand %xmm7, %xmm12 | |
paddd %xmm0, %xmm12 | |
psrad $4, %xmm11 | |
movdqa %xmm13, %xmm0 | |
pand %xmm8, %xmm14 | |
pand %xmm8, %xmm13 | |
paddd %xmm14, %xmm11 | |
psrad $4, %xmm0 | |
paddd %xmm13, %xmm0 | |
movdqa %xmm11, %xmm13 | |
punpckhwd %xmm0, %xmm13 | |
punpcklwd %xmm0, %xmm11 | |
movdqa %xmm11, %xmm0 | |
punpcklwd %xmm13, %xmm11 | |
punpckhwd %xmm13, %xmm0 | |
punpcklwd %xmm0, %xmm11 | |
movdqa %xmm1, %xmm0 | |
pand %xmm8, %xmm1 | |
psrad $4, %xmm0 | |
paddd %xmm1, %xmm0 | |
movdqa %xmm12, %xmm1 | |
pand %xmm8, %xmm12 | |
psrad $4, %xmm1 | |
paddd %xmm12, %xmm1 | |
movdqa %xmm0, %xmm12 | |
punpckhwd %xmm1, %xmm12 | |
punpcklwd %xmm1, %xmm0 | |
movdqa %xmm0, %xmm1 | |
punpcklwd %xmm12, %xmm0 | |
punpckhwd %xmm12, %xmm1 | |
punpcklwd %xmm1, %xmm0 | |
movdqa %xmm11, %xmm1 | |
pand %xmm3, %xmm0 | |
pand %xmm3, %xmm1 | |
movdqa %xmm1, %xmm14 | |
packuswb %xmm0, %xmm14 | |
movaps %xmm14, -16(%rax) | |
cmpq %rax, %rdx | |
jne .L4 | |
# srand(11). %rdi = i = 0 and %ebp = 833 + i are set up before the call;
# both registers are callee-saved, so they survive it.
movl $11, %ecx | |
movl $833, %ebp | |
xorl %edi, %edi | |
call srand | |
leaq arr(%rip), %rbx | |
# %esi = 1321528399: multiply-by-reciprocal constant for the signed
# division i / 13 (1321528399 * 13 = 2^34 + 3).
movl $1321528399, %esi | |
.p2align 4,,10 | |
# .L6: fill loop. arr[i] = (833 + i) << (i % 13), then compute the bit count
# arithmetically (%eax) and via four table lookups (%edx) and call _assert
# if they differ.
.L6: | |
movl %edi, %eax | |
movl %edi, %ecx | |
movl %ebp, %r8d | |
imull %esi | |
movl %edi, %eax | |
sarl $31, %eax | |
sarl $2, %edx | |
subl %eax, %edx | |
# %eax = 13 * (i / 13), so %ecx becomes i % 13 (the shift count in %cl).
leal (%rdx,%rdx,2), %eax | |
leal (%rdx,%rax,4), %eax | |
subl %eax, %ecx | |
sall %cl, %r8d | |
movl %r8d, %eax | |
movl %r8d, %edx | |
movl %r8d, (%rbx,%rdi,4) | |
sarl %eax | |
andl $1431655765, %edx | |
andl $1431655765, %eax | |
addl %edx, %eax | |
movl %eax, %ecx | |
sarl $2, %eax | |
andl $858993459, %eax | |
andl $858993459, %ecx | |
addl %eax, %ecx | |
movl %ecx, %edx | |
sarl $4, %ecx | |
andl $252645135, %ecx | |
andl $252645135, %edx | |
addl %ecx, %edx | |
movl %r8d, %ecx | |
movl %edx, %eax | |
sarl $8, %edx | |
andl $16711935, %edx | |
andl $16711935, %eax | |
sarl $16, %ecx | |
addl %edx, %eax | |
# Table lookups: one sign-extended byte load per byte of the value
# (%r8b and %dh for the low half, %cl and %ch for the high half).
movq %r8, %rdx | |
movzbl %r8b, %r8d | |
movzbl %dh, %edx | |
movzwl %ax, %r9d | |
shrl $16, %eax | |
movsbl (%r12,%rdx), %r10d | |
addl %r9d, %eax | |
movsbl (%r12,%r8), %edx | |
leal (%r10,%rdx), %r8d | |
movzbl %ch, %edx | |
movzbl %cl, %ecx | |
movsbl (%r12,%rdx), %edx | |
movsbl (%r12,%rcx), %ecx | |
addl %ecx, %edx | |
addl %r8d, %edx | |
cmpl %eax, %edx | |
je .L5 | |
# Mismatch: _assert(%rcx = .LC11 expression, %rdx = .LC10 file, %r8d = 32).
leaq .LC10(%rip), %rdx | |
movl $32, %r8d | |
leaq .LC11(%rip), %rcx | |
call _assert | |
.L5: | |
addq $1, %rdi | |
addl $1, %ebp | |
# 104857600 = 419430400 / 4, the number of dwords in arr.
cmpq $104857600, %rdi | |
jne .L6 | |
# printf("start to run.\n"); epoch countdown %esi = 10, checksum %r13d = 0.
leaq .LC12(%rip), %rcx | |
movl $10, %esi | |
xorl %r13d, %r13d | |
call _Z6printfPKcz | |
movdqa .LC13(%rip), %xmm9 | |
movdqa .LC14(%rip), %xmm10 | |
.p2align 4,,10 | |
# .L10: one epoch = three timed passes over arr separated by clock() calls.
# Only the low 32 bits of each clock() result are kept (%ebp, %edi, %r8d).
.L10: | |
call clock | |
leaq arr(%rip), %r14 | |
# %xmm4 = four per-lane partial sums for pass 1, cleared here.
pxor %xmm4, %xmm4 | |
leaq 419430400(%rbx), %r15 | |
movl %eax, %ebp | |
movq %r14, %rbx | |
movq %r14, %rax | |
.p2align 4,,10 | |
# .L7: pass 1, arithmetic bit count on four dwords per iteration. The cmpq
# sets the flags early; the SSE instructions in between do not modify them,
# so the jne at the bottom still tests that comparison.
.L7: | |
movdqa (%rax), %xmm0 | |
addq $16, %rax | |
cmpq %rax, %r15 | |
movdqa %xmm0, %xmm3 | |
pand %xmm6, %xmm0 | |
psrad $1, %xmm3 | |
pand %xmm6, %xmm3 | |
paddd %xmm0, %xmm3 | |
movdqa %xmm3, %xmm2 | |
pand %xmm7, %xmm3 | |
psrad $2, %xmm2 | |
pand %xmm7, %xmm2 | |
paddd %xmm3, %xmm2 | |
movdqa %xmm2, %xmm1 | |
pand %xmm8, %xmm2 | |
psrad $4, %xmm1 | |
pand %xmm8, %xmm1 | |
paddd %xmm2, %xmm1 | |
movdqa %xmm1, %xmm0 | |
pand %xmm9, %xmm1 | |
psrad $8, %xmm0 | |
pand %xmm9, %xmm0 | |
paddd %xmm1, %xmm0 | |
movdqa %xmm0, %xmm1 | |
pand %xmm10, %xmm0 | |
psrld $16, %xmm1 | |
paddd %xmm1, %xmm0 | |
paddd %xmm0, %xmm4 | |
jne .L7 | |
# Horizontal add of the four lanes of %xmm4, folded into the checksum.
movdqa %xmm4, %xmm0 | |
psrldq $8, %xmm0 | |
paddd %xmm0, %xmm4 | |
movdqa %xmm4, %xmm0 | |
psrldq $4, %xmm0 | |
paddd %xmm0, %xmm4 | |
movd %xmm4, %eax | |
addl %eax, %r13d | |
call clock | |
leaq arr(%rip), %r8 | |
movl %eax, %edi | |
.p2align 4,,10 | |
# .L8: pass 2, table-lookup bit count, one dword per iteration (%r8 cursor).
.L8: | |
movl (%r8), %edx | |
addq $4, %r8 | |
movzbl %dh, %eax | |
movl %edx, %ecx | |
movzbl %dl, %edx | |
movsbl (%r12,%rax), %r9d | |
sarl $16, %ecx | |
movsbl (%r12,%rdx), %eax | |
addl %eax, %r9d | |
movzbl %ch, %eax | |
movzbl %cl, %ecx | |
movsbl (%r12,%rax), %eax | |
movsbl (%r12,%rcx), %edx | |
addl %edx, %eax | |
addl %r9d, %eax | |
addl %eax, %r13d | |
cmpq %r8, %r15 | |
jne .L8 | |
call clock | |
movl %eax, %r8d | |
.p2align 4,,10 | |
# .L9: pass 3, hardware popcnt, one dword per iteration (%r14 cursor).
.L9: | |
popcntl (%r14), %eax | |
addq $4, %r14 | |
addl %eax, %r13d | |
cmpq %r14, %r15 | |
jne .L9 | |
# %r8d (clock value after pass 2) is caller-saved, so it is parked at
# 44(%rsp) across the final clock() call.
movl %r8d, 44(%rsp) | |
call clock | |
movl 44(%rsp), %r8d | |
# Timing line: %edx, %r8d, %r9d = the three clock() deltas in pass order.
# The callee reloads %rcx with .LC0 itself.
movl %edi, %edx | |
leaq .LC0(%rip), %rcx | |
subl %ebp, %edx | |
subl %r8d, %eax | |
subl %edi, %r8d | |
movl %eax, %r9d | |
call _Z6printfPKcz.constprop.0 | |
subl $1, %esi | |
jne .L10 | |
# Sum every dword of aarr (four lanes at a time) and add the total to the
# checksum before printing it.
# NOTE(review): the C++ listing on this page has no aarr -- presumably this
# assembly was generated from a different revision of the source; confirm.
leaq aarr(%rip), %rax | |
pxor %xmm0, %xmm0 | |
leaq 419430400(%rax), %rdx | |
.p2align 4,,10 | |
.L11: | |
paddd (%rax), %xmm0 | |
addq $16, %rax | |
cmpq %rax, %rdx | |
jne .L11 | |
leaq .LC15(%rip), %rcx | |
movdqa %xmm0, %xmm1 | |
psrldq $8, %xmm1 | |
paddd %xmm1, %xmm0 | |
movdqa %xmm0, %xmm1 | |
psrldq $4, %xmm1 | |
paddd %xmm1, %xmm0 | |
movd %xmm0, %edx | |
addl %r13d, %edx | |
call _Z6printfPKcz | |
nop | |
# Epilogue: restore %xmm6-%xmm14 and the pushed registers; return 0.
movaps 48(%rsp), %xmm6 | |
xorl %eax, %eax | |
movaps 64(%rsp), %xmm7 | |
movaps 80(%rsp), %xmm8 | |
movaps 96(%rsp), %xmm9 | |
movaps 112(%rsp), %xmm10 | |
movaps 128(%rsp), %xmm11 | |
movaps 144(%rsp), %xmm12 | |
movaps 160(%rsp), %xmm13 | |
movaps 176(%rsp), %xmm14 | |
addq $200, %rsp | |
popq %rbx | |
popq %rsi | |
popq %rdi | |
popq %rbp | |
popq %r12 | |
popq %r13 | |
popq %r14 | |
popq %r15 | |
ret | |
.seh_endproc | |
# Zero-initialised objects (.bss), each 32-byte aligned.
# aarr: 419430400 bytes; main only sums it into the final printed value.
.globl aarr | |
.bss | |
.align 32 | |
aarr: | |
.space 419430400 | |
# arr: 419430400 bytes, the benchmark input filled by main.
.globl arr | |
.align 32 | |
arr: | |
.space 419430400 | |
# table: 256-byte lookup table filled by main.
.globl table | |
.align 32 | |
table: | |
.space 256 | |
# 16-byte vector constants loaded by main with movdqa (hence .align 16).
.section .rdata,"dr" | |
.align 16 | |
# .LC1: initial lane indices 0..3 for the table-initialisation loop.
.LC1: | |
.long 0 | |
.long 1 | |
.long 2 | |
.long 3 | |
.align 16 | |
# .LC2: per-iteration index step (16 table entries per iteration).
.LC2: | |
.long 16 | |
.long 16 | |
.long 16 | |
.long 16 | |
.align 16 | |
# .LC3 / .LC4 / .LC5: offsets +4, +8, +12 that derive the other three
# groups of four indices from the first.
.LC3: | |
.long 4 | |
.long 4 | |
.long 4 | |
.long 4 | |
.align 16 | |
.LC4: | |
.long 8 | |
.long 8 | |
.long 8 | |
.long 8 | |
.align 16 | |
.LC5: | |
.long 12 | |
.long 12 | |
.long 12 | |
.long 12 | |
.align 16 | |
# .LC6: 1431655765 = 0x55555555, mask for the 1-bit summation step.
.LC6: | |
.long 1431655765 | |
.long 1431655765 | |
.long 1431655765 | |
.long 1431655765 | |
.align 16 | |
# .LC7: 858993459 = 0x33333333, mask for the 2-bit summation step.
.LC7: | |
.long 858993459 | |
.long 858993459 | |
.long 858993459 | |
.long 858993459 | |
.align 16 | |
# .LC8: 252645135 = 0x0f0f0f0f, mask for the 4-bit summation step.
.LC8: | |
.long 252645135 | |
.long 252645135 | |
.long 252645135 | |
.long 252645135 | |
.align 16 | |
# .LC9: eight 16-bit words of 0x00ff, applied before the word->byte pack
# in the table-initialisation loop.
.LC9: | |
.word 255 | |
.word 255 | |
.word 255 | |
.word 255 | |
.word 255 | |
.word 255 | |
.word 255 | |
.word 255 | |
.align 16 | |
# .LC13: 16711935 = 0x00ff00ff, mask for the 8-bit summation step.
.LC13: | |
.long 16711935 | |
.long 16711935 | |
.long 16711935 | |
.long 16711935 | |
.align 16 | |
# .LC14: 65535 = 0x0000ffff, mask for the final 16-bit summation step.
.LC14: | |
.long 65535 | |
.long 65535 | |
.long 65535 | |
.long 65535 | |
.ident "GCC: (x86_64-posix-seh-rev0, Built by MinGW-W64 project) 5.3.0" | |
# COFF symbol declarations for the external functions called above.
.def __mingw_vprintf; .scl 2; .type 32; .endef | |
.def srand; .scl 2; .type 32; .endef | |
.def _assert; .scl 2; .type 32; .endef | |
.def clock; .scl 2; .type 32; .endef #asd |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment