Bit Count test
@comzyh, last active April 19, 2018 14:55

This gist contains three parts: GCC 7.3 (Ubuntu, AVX2) assembly output for the benchmark, the C++ source (a.cpp), and MinGW-w64 GCC 5.3 (SSE2) assembly output for the same program.
.file "a.cpp"
.text
.section .rodata.str1.1,"aMS",@progbits,1
.LC10:
.string "a.cpp"
.section .rodata.str1.8,"aMS",@progbits,1
.align 8
.LC11:
.string "bits1(arr[i]) == bits2(arr[i])"
.section .rodata.str1.1
.LC12:
.string "start to run."
.section .rodata.str1.8
.align 8
.LC14:
.string "bits1: %5lu, bits2: %5lu, builtin: %5lu\n"
.section .rodata.str1.1
.LC15:
.string "%d\n"
.section .text.startup,"ax",@progbits
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB86:
.cfi_startproc
leaq 8(%rsp), %r10
.cfi_def_cfa 10, 0
andq $-32, %rsp
pushq -8(%r10)
pushq %rbp
.cfi_escape 0x10,0x6,0x2,0x76,0
movq %rsp, %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
.cfi_escape 0x10,0xf,0x2,0x76,0x78
.cfi_escape 0x10,0xe,0x2,0x76,0x70
.cfi_escape 0x10,0xd,0x2,0x76,0x68
.cfi_escape 0x10,0xc,0x2,0x76,0x60
leaq table(%rip), %r14
pushq %r10
.cfi_escape 0xf,0x3,0x76,0x58,0x6
pushq %rbx
leaq 256(%r14), %rdx
movq %r14, %rax
subq $288, %rsp
.cfi_escape 0x10,0x3,0x2,0x76,0x50
vmovdqa .LC0(%rip), %ymm4
vmovdqa .LC1(%rip), %ymm11
vmovdqa .LC2(%rip), %ymm10
vmovdqa .LC3(%rip), %ymm9
vmovdqa .LC4(%rip), %ymm8
vmovdqa .LC5(%rip), %ymm2
vmovdqa .LC6(%rip), %ymm3
vmovdqa .LC7(%rip), %ymm5
vmovdqa .LC8(%rip), %ymm7
vmovdqa .LC9(%rip), %ymm6
.L2:
vpand %ymm2, %ymm4, %ymm13
vpsrad $1, %ymm4, %ymm0
vpaddd %ymm10, %ymm4, %ymm12
vpand %ymm2, %ymm0, %ymm0
vpaddd %ymm9, %ymm4, %ymm14
vpaddd %ymm8, %ymm4, %ymm1
addq $32, %rax
vpaddd %ymm11, %ymm4, %ymm4
vpaddd %ymm13, %ymm0, %ymm0
vpsrad $1, %ymm12, %ymm13
vpand %ymm2, %ymm12, %ymm12
vpand %ymm2, %ymm13, %ymm13
vpaddd %ymm12, %ymm13, %ymm13
vpsrad $1, %ymm14, %ymm12
vpand %ymm2, %ymm14, %ymm14
vpand %ymm2, %ymm12, %ymm12
vpaddd %ymm14, %ymm12, %ymm12
vpsrad $1, %ymm1, %ymm14
vpand %ymm2, %ymm1, %ymm1
vpand %ymm2, %ymm14, %ymm14
vpaddd %ymm1, %ymm14, %ymm14
vpsrad $2, %ymm0, %ymm1
vpand %ymm3, %ymm0, %ymm0
vpand %ymm3, %ymm1, %ymm1
vpaddd %ymm0, %ymm1, %ymm1
vpsrad $2, %ymm13, %ymm0
vpand %ymm3, %ymm13, %ymm13
vpand %ymm3, %ymm0, %ymm0
vpaddd %ymm13, %ymm0, %ymm0
vpsrad $2, %ymm12, %ymm13
vpand %ymm3, %ymm12, %ymm12
vpand %ymm3, %ymm13, %ymm13
vpaddd %ymm12, %ymm13, %ymm13
vpsrad $2, %ymm14, %ymm12
vpand %ymm3, %ymm14, %ymm14
vpand %ymm3, %ymm12, %ymm12
vpaddd %ymm14, %ymm12, %ymm12
vpsrad $4, %ymm1, %ymm14
vpand %ymm5, %ymm1, %ymm1
vpaddd %ymm1, %ymm14, %ymm1
vpsrad $4, %ymm0, %ymm14
vpand %ymm5, %ymm0, %ymm0
vpaddd %ymm0, %ymm14, %ymm14
vpand %ymm1, %ymm7, %ymm0
vpand %ymm14, %ymm7, %ymm1
vpackusdw %ymm1, %ymm0, %ymm0
vpsrad $4, %ymm13, %ymm1
vpand %ymm5, %ymm13, %ymm13
vpermq $216, %ymm0, %ymm0
vpaddd %ymm13, %ymm1, %ymm1
vpsrad $4, %ymm12, %ymm13
vpand %ymm5, %ymm12, %ymm12
vpand %ymm1, %ymm7, %ymm1
vpand %ymm0, %ymm6, %ymm0
vpaddd %ymm12, %ymm13, %ymm12
vpand %ymm12, %ymm7, %ymm12
vpackusdw %ymm12, %ymm1, %ymm1
vpermq $216, %ymm1, %ymm1
vpand %ymm1, %ymm6, %ymm1
vpackuswb %ymm1, %ymm0, %ymm0
vpermq $216, %ymm0, %ymm0
vmovdqa %ymm0, -32(%rax)
cmpq %rax, %rdx
jne .L2
movl $11, %edi
vmovdqa %ymm7, -176(%rbp)
vmovdqa %ymm5, -144(%rbp)
vmovdqa %ymm3, -112(%rbp)
vmovdqa %ymm2, -80(%rbp)
vzeroupper
leaq arr(%rip), %r12
call srand@PLT
vmovdqa -176(%rbp), %ymm7
xorl %ecx, %ecx
movl $1321528399, %esi
vmovdqa -144(%rbp), %ymm5
vmovdqa -112(%rbp), %ymm3
vmovdqa -80(%rbp), %ymm2
.p2align 4,,10
.p2align 3
.L4:
movl %ecx, %eax
leal 833(%rcx), %edi
movl %ecx, %ebx
mull %esi
shrl $2, %edx
leal (%rdx,%rdx,2), %eax
leal (%rdx,%rax,4), %eax
subl %eax, %ebx
shlx %ebx, %edi, %eax
movl %eax, %edx
movl %eax, %edi
movl %eax, (%r12,%rcx,4)
sarl %edx
andl $1431655765, %edi
andl $1431655765, %edx
addl %edi, %edx
movl %edx, %edi
sarl $2, %edx
andl $858993459, %edi
andl $858993459, %edx
addl %edi, %edx
movl %edx, %edi
sarl $4, %edx
andl $252645135, %edi
andl $252645135, %edx
addl %edi, %edx
movl %edx, %edi
sarl $8, %edx
andl $16711935, %edi
andl $16711935, %edx
addl %edi, %edx
movzbl %ah, %edi
movsbl (%r14,%rdi), %r10d
movzbl %al, %edi
movzwl %dx, %r8d
movsbl (%r14,%rdi), %edi
shrl $16, %edx
addl %r8d, %edx
addl %r10d, %edi
movl %eax, %r10d
shrl $24, %eax
sarl $16, %r10d
movsbl (%r14,%rax), %eax
movzbl %r10b, %r10d
movsbl (%r14,%r10), %r10d
addl %r10d, %eax
addl %edi, %eax
cmpl %edx, %eax
jne .L17
addq $1, %rcx
cmpq $104857600, %rcx
jne .L4
leaq .LC12(%rip), %rdi
vmovdqa %ymm7, -176(%rbp)
vmovdqa %ymm5, -144(%rbp)
vmovdqa %ymm3, -112(%rbp)
vmovdqa %ymm2, -80(%rbp)
vzeroupper
xorl %r15d, %r15d
call puts@PLT
vmovdqa .LC13(%rip), %ymm6
vmovdqa -176(%rbp), %ymm7
movl $10, -308(%rbp)
vmovdqa -144(%rbp), %ymm5
vmovdqa -112(%rbp), %ymm3
vmovdqa -80(%rbp), %ymm2
.p2align 4,,10
.p2align 3
.L8:
vmovdqa %ymm6, -240(%rbp)
vmovdqa %ymm7, -208(%rbp)
vmovdqa %ymm5, -176(%rbp)
vmovdqa %ymm3, -144(%rbp)
vmovdqa %ymm2, -112(%rbp)
vzeroupper
leaq arr(%rip), %rbx
call clock@PLT
vpxor %xmm4, %xmm4, %xmm4
vmovdqa -240(%rbp), %ymm6
vmovdqa -208(%rbp), %ymm7
movq %rax, -80(%rbp)
movq %rbx, %r13
vmovdqa -176(%rbp), %ymm5
leaq 419430400(%r12), %rax
movq %rbx, %r12
vmovdqa -144(%rbp), %ymm3
vmovdqa -112(%rbp), %ymm2
.p2align 4,,10
.p2align 3
.L5:
vmovdqa 0(%r13), %ymm0
addq $32, %r13
cmpq %rax, %r13
vpsrad $1, %ymm0, %ymm1
vpand %ymm2, %ymm0, %ymm0
vpand %ymm2, %ymm1, %ymm1
vpaddd %ymm0, %ymm1, %ymm1
vpsrad $2, %ymm1, %ymm0
vpand %ymm3, %ymm1, %ymm1
vpand %ymm3, %ymm0, %ymm0
vpaddd %ymm1, %ymm0, %ymm0
vpsrad $4, %ymm0, %ymm1
vpand %ymm5, %ymm0, %ymm0
vpand %ymm5, %ymm1, %ymm1
vpaddd %ymm0, %ymm1, %ymm1
vpsrad $8, %ymm1, %ymm0
vpand %ymm6, %ymm1, %ymm1
vpand %ymm6, %ymm0, %ymm0
vpaddd %ymm1, %ymm0, %ymm0
vpsrld $16, %ymm0, %ymm1
vpand %ymm7, %ymm0, %ymm0
vpaddd %ymm0, %ymm1, %ymm0
vpaddd %ymm0, %ymm4, %ymm4
jne .L5
vmovdqa %ymm7, -240(%rbp)
vpxor %xmm7, %xmm7, %xmm7
vmovdqa %ymm6, -272(%rbp)
vmovdqa %ymm5, -208(%rbp)
vperm2i128 $33, %ymm7, %ymm4, %ymm0
vmovdqa %ymm3, -176(%rbp)
vmovdqa %ymm2, -144(%rbp)
vpaddd %ymm4, %ymm0, %ymm0
vperm2i128 $33, %ymm7, %ymm0, %ymm1
vpalignr $8, %ymm0, %ymm1, %ymm1
vpaddd %ymm1, %ymm0, %ymm0
vperm2i128 $33, %ymm7, %ymm0, %ymm1
vpalignr $4, %ymm0, %ymm1, %ymm1
vpaddd %ymm1, %ymm0, %ymm0
vmovd %xmm0, %eax
addl %eax, %r15d
vzeroupper
call clock@PLT
leaq arr(%rip), %rdx
vmovdqa -272(%rbp), %ymm6
movq %rax, -112(%rbp)
vmovdqa -240(%rbp), %ymm7
vmovdqa -208(%rbp), %ymm5
vmovdqa -176(%rbp), %ymm3
vmovdqa -144(%rbp), %ymm2
.p2align 4,,10
.p2align 3
.L6:
movl (%rdx), %eax
addq $4, %rdx
movzbl %ah, %edi
movsbl (%r14,%rdi), %r8d
movzbl %al, %edi
movsbl (%r14,%rdi), %edi
addl %r8d, %edi
movl %eax, %r8d
shrl $24, %eax
sarl $16, %r8d
movsbl (%r14,%rax), %eax
movzbl %r8b, %r8d
movsbl (%r14,%r8), %r8d
addl %r8d, %eax
addl %edi, %eax
addl %eax, %r15d
cmpq %rdx, %r13
jne .L6
vmovdqa %ymm6, -304(%rbp)
movq %rdx, -144(%rbp)
vmovdqa %ymm7, -272(%rbp)
vmovdqa %ymm5, -240(%rbp)
vmovdqa %ymm3, -208(%rbp)
vmovdqa %ymm2, -176(%rbp)
vzeroupper
call clock@PLT
movq -144(%rbp), %rdx
movq %rax, %r13
vmovdqa -304(%rbp), %ymm6
vmovdqa -272(%rbp), %ymm7
vmovdqa -240(%rbp), %ymm5
vmovdqa -208(%rbp), %ymm3
vmovdqa -176(%rbp), %ymm2
.p2align 4,,10
.p2align 3
.L7:
xorl %eax, %eax
popcntl (%rbx), %eax
addq $4, %rbx
addl %eax, %r15d
cmpq %rbx, %rdx
jne .L7
vmovdqa %ymm6, -272(%rbp)
vmovdqa %ymm7, -240(%rbp)
vmovdqa %ymm5, -208(%rbp)
vmovdqa %ymm3, -176(%rbp)
vmovdqa %ymm2, -144(%rbp)
vzeroupper
call clock@PLT
movq -112(%rbp), %rdx
movq %r13, %rcx
subq %r13, %rax
leaq .LC14(%rip), %rsi
movq %rax, %r8
movl $1, %edi
xorl %eax, %eax
subq %rdx, %rcx
subq -80(%rbp), %rdx
call __printf_chk@PLT
subl $1, -308(%rbp)
vmovdqa -144(%rbp), %ymm2
vmovdqa -176(%rbp), %ymm3
vmovdqa -208(%rbp), %ymm5
vmovdqa -240(%rbp), %ymm7
vmovdqa -272(%rbp), %ymm6
jne .L8
leaq .LC15(%rip), %rsi
movl %r15d, %edx
movl $1, %edi
xorl %eax, %eax
vzeroupper
call __printf_chk@PLT
addq $288, %rsp
xorl %eax, %eax
popq %rbx
popq %r10
.cfi_remember_state
.cfi_def_cfa 10, 0
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
leaq -8(%r10), %rsp
.cfi_def_cfa 7, 8
ret
.L17:
.cfi_restore_state
leaq _ZZ4mainE19__PRETTY_FUNCTION__(%rip), %rcx
leaq .LC10(%rip), %rsi
leaq .LC11(%rip), %rdi
movl $32, %edx
vzeroupper
call __assert_fail@PLT
.cfi_endproc
.LFE86:
.size main, .-main
.section .rodata
.align 8
.type _ZZ4mainE19__PRETTY_FUNCTION__, @object
.size _ZZ4mainE19__PRETTY_FUNCTION__, 11
_ZZ4mainE19__PRETTY_FUNCTION__:
.string "int main()"
.globl arr
.bss
.align 32
.type arr, @object
.size arr, 419430400
arr:
.zero 419430400
.globl table
.align 32
.type table, @object
.size table, 256
table:
.zero 256
.section .rodata.cst32,"aM",@progbits,32
.align 32
.LC0:
.long 0
.long 1
.long 2
.long 3
.long 4
.long 5
.long 6
.long 7
.align 32
.LC1:
.long 32
.long 32
.long 32
.long 32
.long 32
.long 32
.long 32
.long 32
.align 32
.LC2:
.long 8
.long 8
.long 8
.long 8
.long 8
.long 8
.long 8
.long 8
.align 32
.LC3:
.long 16
.long 16
.long 16
.long 16
.long 16
.long 16
.long 16
.long 16
.align 32
.LC4:
.long 24
.long 24
.long 24
.long 24
.long 24
.long 24
.long 24
.long 24
.align 32
.LC5:
.long 1431655765
.long 1431655765
.long 1431655765
.long 1431655765
.long 1431655765
.long 1431655765
.long 1431655765
.long 1431655765
.align 32
.LC6:
.long 858993459
.long 858993459
.long 858993459
.long 858993459
.long 858993459
.long 858993459
.long 858993459
.long 858993459
.align 32
.LC7:
.long 252645135
.long 252645135
.long 252645135
.long 252645135
.long 252645135
.long 252645135
.long 252645135
.long 252645135
.align 32
.LC8:
.long 65535
.long 65535
.long 65535
.long 65535
.long 65535
.long 65535
.long 65535
.long 65535
.align 32
.LC9:
.value 255
.value 255
.value 255
.value 255
.value 255
.value 255
.value 255
.value 255
.value 255
.value 255
.value 255
.value 255
.value 255
.value 255
.value 255
.value 255
.align 32
.LC13:
.long 16711935
.long 16711935
.long 16711935
.long 16711935
.long 16711935
.long 16711935
.long 16711935
.long 16711935
.ident "GCC: (Ubuntu 7.3.0-15ubuntu2) 7.3.0"
.section .note.GNU-stack,"",@progbits
#include <cstdio>
#include <cstring>
#include <time.h>
#include <cassert>
#include <cstdlib>

char table[256];          // per-byte popcount lookup table
int arr[100 << 20];       // 100M ints (400 MiB) of test data

// SWAR popcount: sum bits in parallel at widths 2, 4, 8, 16, 32.
inline int bits1(int x) {
    x = (x & 0x55555555) + ((x >> 1 ) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2 ) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >> 4 ) & 0x0f0f0f0f);
    x = (x & 0x00ff00ff) + ((x >> 8 ) & 0x00ff00ff);
    x = (x & 0x0000ffff) + ((x >> 16) & 0x0000ffff);
    return x;
}

// Table popcount: look up each of the four bytes in `table`.
inline int bits2(int x) {
    int ans = 0;
    ans += table[x & 0xff] + table[(x & 0xff00) >> 8];
    x >>= 16;
    ans += table[x & 0xff] + table[(x & 0xff00) >> 8];
    return ans;
}

int main()
{
    // Build the byte-wise lookup table used by bits2.
    for (int i = 0; i < 256; i++) {
        table[i] = bits1(i);
    }
    srand(11);
    // Fill the test array and check that both methods agree.
    for (int i = 0; i < sizeof(arr) / sizeof(int); i++) {
        arr[i] = (0x341 + i) << (i % 13);
        assert(bits1(arr[i]) == bits2(arr[i]));
    }
    printf("start to run.\n");
    int nonsense = 0;   // accumulated and printed so the loops cannot be optimized away
    const int N = sizeof(arr) / sizeof(int);
    for (int epoch = 0; epoch < 10; epoch++) {
        clock_t start = clock();
        for (int i = 0; i < N; i++)
            nonsense += bits1(arr[i]);
        clock_t t1 = clock();
        for (int i = 0; i < N; i++)
            nonsense += bits2(arr[i]);
        clock_t t2 = clock();
        for (int i = 0; i < N; i++)
            nonsense += __builtin_popcount(arr[i]);
        clock_t t3 = clock();
        // Times are reported in clock() ticks per method, per epoch.
        printf("bits1: %5lu, bits2: %5lu, builtin: %5lu\n", t1 - start, t2 - t1, t3 - t2);
    }
    printf("%d\n", nonsense);
}
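
The assembly listings before and after this source are GCC output for the same benchmark: the first from GCC 7.3 on Ubuntu, which vectorizes bits1 with 256-bit ymm registers (AVX2) and also uses shlx (BMI2) and popcntl; the second from MinGW-w64 GCC 5.3, which uses only 128-bit xmm registers (SSE2) plus popcntl. The gist does not record the build flags, so the commands below are only an assumed sketch of how comparable output could be reproduced:

    g++ -O3 -march=haswell -S a.cpp    # assumption: an -march with AVX2, BMI2 and POPCNT, for the GCC 7.3 listing
    g++ -O3 -mpopcnt -S A.cpp          # assumption: SSE2 baseline plus POPCNT, for the MinGW-w64 5.3 listing

Any -march or -m flags that enable the same instruction sets should give similar code; without -mpopcnt (or an -march that implies it), __builtin_popcount would not compile to the popcntl instruction seen in both listings.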
.file "A.cpp"
.section .rdata,"dr"
.align 8
.LC0:
.ascii "bits1: %5lu, bits2: %5lu, builtin: %5lu\12\0"
.text
.p2align 4,,15
.def _Z6printfPKcz.constprop.0; .scl 3; .type 32; .endef
.seh_proc _Z6printfPKcz.constprop.0
_Z6printfPKcz.constprop.0:
.LFB91:
subq $56, %rsp
.seh_stackalloc 56
.seh_endprologue
leaq .LC0(%rip), %rcx
movq %rdx, 72(%rsp)
leaq 72(%rsp), %rdx
movq %r8, 80(%rsp)
movq %r9, 88(%rsp)
movq %rdx, 40(%rsp)
call __mingw_vprintf
addq $56, %rsp
ret
.seh_endproc
.section .text$_Z6printfPKcz,"x"
.linkonce discard
.p2align 4,,15
.globl _Z6printfPKcz
.def _Z6printfPKcz; .scl 2; .type 32; .endef
.seh_proc _Z6printfPKcz
_Z6printfPKcz:
.LFB8:
subq $56, %rsp
.seh_stackalloc 56
.seh_endprologue
movq %rdx, 72(%rsp)
leaq 72(%rsp), %rdx
movq %r8, 80(%rsp)
movq %r9, 88(%rsp)
movq %rdx, 40(%rsp)
call __mingw_vprintf
addq $56, %rsp
ret
.seh_endproc
.def __main; .scl 2; .type 32; .endef
.section .rdata,"dr"
.LC10:
.ascii "A.cpp\0"
.align 8
.LC11:
.ascii "bits1(arr[i]) == bits2(arr[i])\0"
.LC12:
.ascii "start to run.\12\0"
.LC15:
.ascii "%d\12\0"
.section .text.startup,"x"
.p2align 4,,15
.globl main
.def main; .scl 2; .type 32; .endef
.seh_proc main
main:
.LFB90:
pushq %r15
.seh_pushreg %r15
pushq %r14
.seh_pushreg %r14
pushq %r13
.seh_pushreg %r13
pushq %r12
.seh_pushreg %r12
pushq %rbp
.seh_pushreg %rbp
pushq %rdi
.seh_pushreg %rdi
pushq %rsi
.seh_pushreg %rsi
pushq %rbx
.seh_pushreg %rbx
subq $200, %rsp
.seh_stackalloc 200
movaps %xmm6, 48(%rsp)
.seh_savexmm %xmm6, 48
movaps %xmm7, 64(%rsp)
.seh_savexmm %xmm7, 64
movaps %xmm8, 80(%rsp)
.seh_savexmm %xmm8, 80
movaps %xmm9, 96(%rsp)
.seh_savexmm %xmm9, 96
movaps %xmm10, 112(%rsp)
.seh_savexmm %xmm10, 112
movaps %xmm11, 128(%rsp)
.seh_savexmm %xmm11, 128
movaps %xmm12, 144(%rsp)
.seh_savexmm %xmm12, 144
movaps %xmm13, 160(%rsp)
.seh_savexmm %xmm13, 160
movaps %xmm14, 176(%rsp)
.seh_savexmm %xmm14, 176
.seh_endprologue
leaq table(%rip), %r12
call __main
leaq 256+table(%rip), %rdx
movq %r12, %rax
movdqa .LC1(%rip), %xmm2
movdqa .LC2(%rip), %xmm10
movdqa .LC3(%rip), %xmm9
movdqa .LC4(%rip), %xmm5
movdqa .LC5(%rip), %xmm4
movdqa .LC6(%rip), %xmm6
movdqa .LC7(%rip), %xmm7
movdqa .LC8(%rip), %xmm8
movdqa .LC9(%rip), %xmm3
.L4:
movdqa %xmm2, %xmm12
movdqa %xmm2, %xmm1
movdqa %xmm2, %xmm0
psrad $1, %xmm12
paddd %xmm9, %xmm1
paddd %xmm5, %xmm0
pand %xmm6, %xmm12
movdqa %xmm12, %xmm11
movdqa %xmm2, %xmm12
movdqa %xmm2, %xmm13
pand %xmm6, %xmm12
paddd %xmm11, %xmm12
movdqa %xmm1, %xmm11
pand %xmm6, %xmm1
psrad $1, %xmm11
paddd %xmm4, %xmm13
movdqa %xmm12, %xmm14
pand %xmm7, %xmm12
pand %xmm6, %xmm11
paddd %xmm1, %xmm11
movdqa %xmm0, %xmm1
pand %xmm6, %xmm0
psrad $1, %xmm1
addq $16, %rax
paddd %xmm10, %xmm2
pand %xmm6, %xmm1
paddd %xmm0, %xmm1
movdqa %xmm13, %xmm0
pand %xmm6, %xmm13
psrad $1, %xmm0
pand %xmm6, %xmm0
paddd %xmm13, %xmm0
movdqa %xmm11, %xmm13
pand %xmm7, %xmm11
psrad $2, %xmm13
psrad $2, %xmm14
pand %xmm7, %xmm13
paddd %xmm11, %xmm13
movdqa %xmm1, %xmm11
pand %xmm7, %xmm14
psrad $2, %xmm11
paddd %xmm12, %xmm14
movdqa %xmm0, %xmm12
pand %xmm7, %xmm11
pand %xmm7, %xmm1
psrad $2, %xmm12
pand %xmm7, %xmm0
paddd %xmm11, %xmm1
movdqa %xmm14, %xmm11
pand %xmm7, %xmm12
paddd %xmm0, %xmm12
psrad $4, %xmm11
movdqa %xmm13, %xmm0
pand %xmm8, %xmm14
pand %xmm8, %xmm13
paddd %xmm14, %xmm11
psrad $4, %xmm0
paddd %xmm13, %xmm0
movdqa %xmm11, %xmm13
punpckhwd %xmm0, %xmm13
punpcklwd %xmm0, %xmm11
movdqa %xmm11, %xmm0
punpcklwd %xmm13, %xmm11
punpckhwd %xmm13, %xmm0
punpcklwd %xmm0, %xmm11
movdqa %xmm1, %xmm0
pand %xmm8, %xmm1
psrad $4, %xmm0
paddd %xmm1, %xmm0
movdqa %xmm12, %xmm1
pand %xmm8, %xmm12
psrad $4, %xmm1
paddd %xmm12, %xmm1
movdqa %xmm0, %xmm12
punpckhwd %xmm1, %xmm12
punpcklwd %xmm1, %xmm0
movdqa %xmm0, %xmm1
punpcklwd %xmm12, %xmm0
punpckhwd %xmm12, %xmm1
punpcklwd %xmm1, %xmm0
movdqa %xmm11, %xmm1
pand %xmm3, %xmm0
pand %xmm3, %xmm1
movdqa %xmm1, %xmm14
packuswb %xmm0, %xmm14
movaps %xmm14, -16(%rax)
cmpq %rax, %rdx
jne .L4
movl $11, %ecx
movl $833, %ebp
xorl %edi, %edi
call srand
leaq arr(%rip), %rbx
movl $1321528399, %esi
.p2align 4,,10
.L6:
movl %edi, %eax
movl %edi, %ecx
movl %ebp, %r8d
imull %esi
movl %edi, %eax
sarl $31, %eax
sarl $2, %edx
subl %eax, %edx
leal (%rdx,%rdx,2), %eax
leal (%rdx,%rax,4), %eax
subl %eax, %ecx
sall %cl, %r8d
movl %r8d, %eax
movl %r8d, %edx
movl %r8d, (%rbx,%rdi,4)
sarl %eax
andl $1431655765, %edx
andl $1431655765, %eax
addl %edx, %eax
movl %eax, %ecx
sarl $2, %eax
andl $858993459, %eax
andl $858993459, %ecx
addl %eax, %ecx
movl %ecx, %edx
sarl $4, %ecx
andl $252645135, %ecx
andl $252645135, %edx
addl %ecx, %edx
movl %r8d, %ecx
movl %edx, %eax
sarl $8, %edx
andl $16711935, %edx
andl $16711935, %eax
sarl $16, %ecx
addl %edx, %eax
movq %r8, %rdx
movzbl %r8b, %r8d
movzbl %dh, %edx
movzwl %ax, %r9d
shrl $16, %eax
movsbl (%r12,%rdx), %r10d
addl %r9d, %eax
movsbl (%r12,%r8), %edx
leal (%r10,%rdx), %r8d
movzbl %ch, %edx
movzbl %cl, %ecx
movsbl (%r12,%rdx), %edx
movsbl (%r12,%rcx), %ecx
addl %ecx, %edx
addl %r8d, %edx
cmpl %eax, %edx
je .L5
leaq .LC10(%rip), %rdx
movl $32, %r8d
leaq .LC11(%rip), %rcx
call _assert
.L5:
addq $1, %rdi
addl $1, %ebp
cmpq $104857600, %rdi
jne .L6
leaq .LC12(%rip), %rcx
movl $10, %esi
xorl %r13d, %r13d
call _Z6printfPKcz
movdqa .LC13(%rip), %xmm9
movdqa .LC14(%rip), %xmm10
.p2align 4,,10
.L10:
call clock
leaq arr(%rip), %r14
pxor %xmm4, %xmm4
leaq 419430400(%rbx), %r15
movl %eax, %ebp
movq %r14, %rbx
movq %r14, %rax
.p2align 4,,10
.L7:
movdqa (%rax), %xmm0
addq $16, %rax
cmpq %rax, %r15
movdqa %xmm0, %xmm3
pand %xmm6, %xmm0
psrad $1, %xmm3
pand %xmm6, %xmm3
paddd %xmm0, %xmm3
movdqa %xmm3, %xmm2
pand %xmm7, %xmm3
psrad $2, %xmm2
pand %xmm7, %xmm2
paddd %xmm3, %xmm2
movdqa %xmm2, %xmm1
pand %xmm8, %xmm2
psrad $4, %xmm1
pand %xmm8, %xmm1
paddd %xmm2, %xmm1
movdqa %xmm1, %xmm0
pand %xmm9, %xmm1
psrad $8, %xmm0
pand %xmm9, %xmm0
paddd %xmm1, %xmm0
movdqa %xmm0, %xmm1
pand %xmm10, %xmm0
psrld $16, %xmm1
paddd %xmm1, %xmm0
paddd %xmm0, %xmm4
jne .L7
movdqa %xmm4, %xmm0
psrldq $8, %xmm0
paddd %xmm0, %xmm4
movdqa %xmm4, %xmm0
psrldq $4, %xmm0
paddd %xmm0, %xmm4
movd %xmm4, %eax
addl %eax, %r13d
call clock
leaq arr(%rip), %r8
movl %eax, %edi
.p2align 4,,10
.L8:
movl (%r8), %edx
addq $4, %r8
movzbl %dh, %eax
movl %edx, %ecx
movzbl %dl, %edx
movsbl (%r12,%rax), %r9d
sarl $16, %ecx
movsbl (%r12,%rdx), %eax
addl %eax, %r9d
movzbl %ch, %eax
movzbl %cl, %ecx
movsbl (%r12,%rax), %eax
movsbl (%r12,%rcx), %edx
addl %edx, %eax
addl %r9d, %eax
addl %eax, %r13d
cmpq %r8, %r15
jne .L8
call clock
movl %eax, %r8d
.p2align 4,,10
.L9:
popcntl (%r14), %eax
addq $4, %r14
addl %eax, %r13d
cmpq %r14, %r15
jne .L9
movl %r8d, 44(%rsp)
call clock
movl 44(%rsp), %r8d
movl %edi, %edx
leaq .LC0(%rip), %rcx
subl %ebp, %edx
subl %r8d, %eax
subl %edi, %r8d
movl %eax, %r9d
call _Z6printfPKcz.constprop.0
subl $1, %esi
jne .L10
leaq aarr(%rip), %rax
pxor %xmm0, %xmm0
leaq 419430400(%rax), %rdx
.p2align 4,,10
.L11:
paddd (%rax), %xmm0
addq $16, %rax
cmpq %rax, %rdx
jne .L11
leaq .LC15(%rip), %rcx
movdqa %xmm0, %xmm1
psrldq $8, %xmm1
paddd %xmm1, %xmm0
movdqa %xmm0, %xmm1
psrldq $4, %xmm1
paddd %xmm1, %xmm0
movd %xmm0, %edx
addl %r13d, %edx
call _Z6printfPKcz
nop
movaps 48(%rsp), %xmm6
xorl %eax, %eax
movaps 64(%rsp), %xmm7
movaps 80(%rsp), %xmm8
movaps 96(%rsp), %xmm9
movaps 112(%rsp), %xmm10
movaps 128(%rsp), %xmm11
movaps 144(%rsp), %xmm12
movaps 160(%rsp), %xmm13
movaps 176(%rsp), %xmm14
addq $200, %rsp
popq %rbx
popq %rsi
popq %rdi
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15
ret
.seh_endproc
.globl aarr
.bss
.align 32
aarr:
.space 419430400
.globl arr
.align 32
arr:
.space 419430400
.globl table
.align 32
table:
.space 256
.section .rdata,"dr"
.align 16
.LC1:
.long 0
.long 1
.long 2
.long 3
.align 16
.LC2:
.long 16
.long 16
.long 16
.long 16
.align 16
.LC3:
.long 4
.long 4
.long 4
.long 4
.align 16
.LC4:
.long 8
.long 8
.long 8
.long 8
.align 16
.LC5:
.long 12
.long 12
.long 12
.long 12
.align 16
.LC6:
.long 1431655765
.long 1431655765
.long 1431655765
.long 1431655765
.align 16
.LC7:
.long 858993459
.long 858993459
.long 858993459
.long 858993459
.align 16
.LC8:
.long 252645135
.long 252645135
.long 252645135
.long 252645135
.align 16
.LC9:
.word 255
.word 255
.word 255
.word 255
.word 255
.word 255
.word 255
.word 255
.align 16
.LC13:
.long 16711935
.long 16711935
.long 16711935
.long 16711935
.align 16
.LC14:
.long 65535
.long 65535
.long 65535
.long 65535
.ident "GCC: (x86_64-posix-seh-rev0, Built by MinGW-W64 project) 5.3.0"
.def __mingw_vprintf; .scl 2; .type 32; .endef
.def srand; .scl 2; .type 32; .endef
.def _assert; .scl 2; .type 32; .endef
.def clock; .scl 2; .type 32; .endef