Skip to content

Instantly share code, notes, and snippets.

@walkoncross
Forked from comzyh/bits-i7-7700HQ.s
Created December 29, 2016 01:31
Show Gist options
  • Save walkoncross/55c5f6c8642726bfcf6cc63670a84c6a to your computer and use it in GitHub Desktop.
Save walkoncross/55c5f6c8642726bfcf6cc63670a84c6a to your computer and use it in GitHub Desktop.
Bit Count test
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <time.h>
// Per-byte bit-count lookup table; main() fills entry i with bits1(i).
char table[256];
// Benchmark input: 100 << 20 ints, filled by main() before timing starts.
int arr[100 << 20];
// Returns the number of set bits in x.
//
// Uses five mask-and-add steps: neighbouring 1-bit fields are summed into
// 2-bit fields, then 4-, 8-, 16- and finally one 32-bit field.
//
// The work is done on an unsigned copy of x. Intermediate sums can set bit 31
// (x = -1 yields 0xAAAAAAAA after the first step), which would be signed
// overflow if the additions were carried out in int. The final value is at
// most 32, so converting back to int is always exact.
inline int bits1(int x) {
    unsigned int v = static_cast<unsigned int>(x);
    v = (v & 0x55555555u) + ((v >> 1 ) & 0x55555555u);
    v = (v & 0x33333333u) + ((v >> 2 ) & 0x33333333u);
    v = (v & 0x0f0f0f0fu) + ((v >> 4 ) & 0x0f0f0f0fu);
    v = (v & 0x00ff00ffu) + ((v >> 8 ) & 0x00ff00ffu);
    v = (v & 0x0000ffffu) + ((v >> 16) & 0x0000ffffu);
    return static_cast<int>(v);
}
// Returns the number of set bits in x by summing the lookup-table entries of
// its four bytes. Requires `table` to have been filled beforehand.
inline int bits2(int x) {
    // Bytes 0 and 1 come straight from x.
    const int lowerHalf = table[x & 0xff] + table[(x & 0xff00) >> 8];
    // Bytes 2 and 3 are reached by shifting them down first.
    const int upper = x >> 16;
    const int upperHalf = table[upper & 0xff] + table[(upper & 0xff00) >> 8];
    return lowerHalf + upperHalf;
}
// Benchmark driver.
//
// 1. Fills `table` with the bit count of every byte value.
// 2. Fills `arr` with a deterministic pattern and asserts that bits1 and bits2
//    agree on every element.
// 3. For 10 epochs, times three full passes over `arr` (bits1, bits2 and
//    __builtin_popcount) and prints the clock() tick counts of each pass.
// 4. Prints the accumulated sum so the passes have an observable result.
int main()
{
    const int N = sizeof(arr) / sizeof(int);

    for (int i = 0; i < 256; i++) {
        table[i] = bits1(i);
    }
    srand (11);
    for (int i = 0; i < N; i++) {
        // Shift in unsigned arithmetic: the value does not fit in 31 bits for
        // large i, and overflowing a signed left shift is undefined.
        arr[i] = static_cast<int>((0x341u + static_cast<unsigned int>(i)) << (i % 13));
        assert(bits1(arr[i]) == bits2(arr[i]));
    }
    printf("start to run.\n");

    // Unsigned accumulator: the total over all passes exceeds INT_MAX, and
    // unsigned arithmetic wraps instead of invoking undefined behaviour.
    unsigned int nonsense = 0;
    for (int epoch = 0; epoch < 10; epoch ++) {
        clock_t start = clock();
        for (int i = 0; i < N; i++)
            nonsense += bits1(arr[i]);
        clock_t t1 = clock();
        for (int i = 0; i < N; i++)
            nonsense += bits2(arr[i]);
        clock_t t2 = clock();
        for (int i = 0; i < N; i++)
            nonsense += __builtin_popcount(arr[i]);
        clock_t t3 = clock();
        // clock_t is not guaranteed to be unsigned long; cast to match %lu.
        printf("bits1: %5lu, bits2: %5lu, builtin: %5lu\n",
               static_cast<unsigned long>(t1 - start),
               static_cast<unsigned long>(t2 - t1),
               static_cast<unsigned long>(t3 - t2));
    }
    // Printed as a signed value, as before.
    printf("%d\n", static_cast<int>(nonsense));
}
# Compiler output for A.cpp in AT&T syntax. The .seh_* directives and the
# __mingw_* symbols used below indicate a Windows x64 (MinGW-w64) target.
.file "A.cpp"
.section .rdata,"dr"
.align 8
# Format string of the per-epoch timing report printed by main.
.LC0:
.ascii "bits1: %5lu, bits2: %5lu, builtin: %5lu\12\0"
.text
# File-local clone of printf(const char*, ...) whose format argument is fixed
# to .LC0: the incoming %rcx is not read, it is overwritten below.
# In:  %rdx, %r8, %r9 = the variadic arguments.
# Out: %eax as left by __mingw_vprintf.
.p2align 4,,15
.def _Z6printfPKcz.constprop.0; .scl 3; .type 32; .endef
.seh_proc _Z6printfPKcz.constprop.0
_Z6printfPKcz.constprop.0:
.LFB91:
# After this adjustment the return address is at 56(%rsp), so the caller's
# home slots for %rdx, %r8 and %r9 are at 72, 80 and 88(%rsp).
subq $56, %rsp
.seh_stackalloc 56
.seh_endprologue
# First argument of __mingw_vprintf: the constant format string.
leaq .LC0(%rip), %rcx
# Spill the register arguments into their home slots so they form one
# contiguous argument area in memory.
movq %rdx, 72(%rsp)
# Second argument: pointer to the first variadic argument (the va_list).
leaq 72(%rsp), %rdx
movq %r8, 80(%rsp)
movq %r9, 88(%rsp)
# Keep a copy of the va_list pointer in a local slot.
movq %rdx, 40(%rsp)
call __mingw_vprintf
addq $56, %rsp
ret
.seh_endproc
# printf(const char*, ...) emitted as a link-once function that forwards to
# __mingw_vprintf.
# In:  %rcx = format string, %rdx, %r8, %r9 = first variadic arguments.
# Out: %eax as left by __mingw_vprintf.
.section .text$_Z6printfPKcz,"x"
.linkonce discard
.p2align 4,,15
.globl _Z6printfPKcz
.def _Z6printfPKcz; .scl 2; .type 32; .endef
.seh_proc _Z6printfPKcz
_Z6printfPKcz:
.LFB8:
# After this adjustment the return address is at 56(%rsp), so the caller's
# home slots for %rdx, %r8 and %r9 are at 72, 80 and 88(%rsp).
subq $56, %rsp
.seh_stackalloc 56
.seh_endprologue
# Spill the register arguments into their home slots; %rcx (the format) is
# passed through unchanged as the first argument of __mingw_vprintf.
movq %rdx, 72(%rsp)
# Second argument: pointer to the first variadic argument (the va_list).
leaq 72(%rsp), %rdx
movq %r8, 80(%rsp)
movq %r9, 88(%rsp)
# Keep a copy of the va_list pointer in a local slot.
movq %rdx, 40(%rsp)
call __mingw_vprintf
addq $56, %rsp
ret
.seh_endproc
.def __main; .scl 2; .type 32; .endef
# Read-only strings used by main.
.section .rdata,"dr"
# Source file name handed to _assert. main loads this string through the
# label .LC10, so the label must carry exactly that name.
.LC10:
.ascii "A.cpp\0"
.align 8
# Text of the asserted expression handed to _assert.
.LC11:
.ascii "bits1(arr[i]) == bits2(arr[i])\0"
# Banner printed once before the timed epochs.
.LC12:
.ascii "start to run.\12\0"
# Format of the final checksum line.
.LC15:
.ascii "%d\12\0"
# int main(): entry, register saves and constant set-up.
# Register roles established here and kept for the rest of the function:
#   %r12        = address of table
#   %xmm6/7/8   = bit masks .LC6/.LC7/.LC8 (also used by the timed vector loop)
.section .text.startup,"x"
.p2align 4,,15
.globl main
.def main; .scl 2; .type 32; .endef
.seh_proc main
main:
.LFB90:
# Save the eight general-purpose registers this function modifies.
pushq %r15
.seh_pushreg %r15
pushq %r14
.seh_pushreg %r14
pushq %r13
.seh_pushreg %r13
pushq %r12
.seh_pushreg %r12
pushq %rbp
.seh_pushreg %rbp
pushq %rdi
.seh_pushreg %rdi
pushq %rsi
.seh_pushreg %rsi
pushq %rbx
.seh_pushreg %rbx
# Local frame of 200 bytes: xmm6-xmm14 are saved at 48..191(%rsp) and
# 44(%rsp) is used later as a spill slot.
subq $200, %rsp
.seh_stackalloc 200
movaps %xmm6, 48(%rsp)
.seh_savexmm %xmm6, 48
movaps %xmm7, 64(%rsp)
.seh_savexmm %xmm7, 64
movaps %xmm8, 80(%rsp)
.seh_savexmm %xmm8, 80
movaps %xmm9, 96(%rsp)
.seh_savexmm %xmm9, 96
movaps %xmm10, 112(%rsp)
.seh_savexmm %xmm10, 112
movaps %xmm11, 128(%rsp)
.seh_savexmm %xmm11, 128
movaps %xmm12, 144(%rsp)
.seh_savexmm %xmm12, 144
movaps %xmm13, 160(%rsp)
.seh_savexmm %xmm13, 160
movaps %xmm14, 176(%rsp)
.seh_savexmm %xmm14, 176
.seh_endprologue
# r12 = &table[0]
leaq table(%rip), %r12
call __main
# rdx = &table[256] (end of the table), rax = write cursor into the table.
leaq 256+table(%rip), %rdx
movq %r12, %rax
# Constants for the table-building loop:
#   xmm2  = current indices {0,1,2,3}      xmm10 = per-iteration step (16)
#   xmm9  = +4 offsets   xmm5 = +8 offsets   xmm4 = +12 offsets
#   xmm6  = 0x55555555   xmm7 = 0x33333333   xmm8 = 0x0f0f0f0f
#   xmm3  = 0x00ff in every 16-bit lane
movdqa .LC1(%rip), %xmm2
movdqa .LC2(%rip), %xmm10
movdqa .LC3(%rip), %xmm9
movdqa .LC4(%rip), %xmm5
movdqa .LC5(%rip), %xmm4
movdqa .LC6(%rip), %xmm6
movdqa .LC7(%rip), %xmm7
movdqa .LC8(%rip), %xmm8
movdqa .LC9(%rip), %xmm3
# Table-building loop: each iteration produces 16 consecutive table bytes.
# Four vectors of four indices are processed together:
#   xmm2 (i..i+3), xmm2+xmm9 (i+4..), xmm2+xmm5 (i+8..), xmm2+xmm4 (i+12..)
# Each vector goes through three mask-and-add stages with shifts of 1, 2 and
# 4 bits (masks xmm6, xmm7, xmm8); in the last stage only the unshifted
# operand is masked. The 8- and 16-bit stages of bits1 do not appear here,
# presumably because every index is below 256 - verify against the C++ source.
# The four dword vectors are then narrowed to bytes and stored.
.L4:
# Stage 1 (shift 1, mask xmm6) for all four index vectors.
movdqa %xmm2, %xmm12
movdqa %xmm2, %xmm1
movdqa %xmm2, %xmm0
psrad $1, %xmm12
paddd %xmm9, %xmm1
paddd %xmm5, %xmm0
pand %xmm6, %xmm12
movdqa %xmm12, %xmm11
movdqa %xmm2, %xmm12
movdqa %xmm2, %xmm13
pand %xmm6, %xmm12
paddd %xmm11, %xmm12
movdqa %xmm1, %xmm11
pand %xmm6, %xmm1
psrad $1, %xmm11
paddd %xmm4, %xmm13
movdqa %xmm12, %xmm14
pand %xmm7, %xmm12
pand %xmm6, %xmm11
paddd %xmm1, %xmm11
movdqa %xmm0, %xmm1
pand %xmm6, %xmm0
psrad $1, %xmm1
# Advance the output cursor now; the store at the bottom uses -16(%rax).
addq $16, %rax
# Advance the base index vector by 16 for the next iteration.
paddd %xmm10, %xmm2
pand %xmm6, %xmm1
paddd %xmm0, %xmm1
movdqa %xmm13, %xmm0
pand %xmm6, %xmm13
psrad $1, %xmm0
pand %xmm6, %xmm0
paddd %xmm13, %xmm0
# Stage 2 (shift 2, mask xmm7).
movdqa %xmm11, %xmm13
pand %xmm7, %xmm11
psrad $2, %xmm13
psrad $2, %xmm14
pand %xmm7, %xmm13
paddd %xmm11, %xmm13
movdqa %xmm1, %xmm11
pand %xmm7, %xmm14
psrad $2, %xmm11
paddd %xmm12, %xmm14
movdqa %xmm0, %xmm12
pand %xmm7, %xmm11
pand %xmm7, %xmm1
psrad $2, %xmm12
pand %xmm7, %xmm0
paddd %xmm11, %xmm1
# Stage 3 (shift 4, mask xmm8 on the unshifted operand) for the first two
# vectors, followed by word interleaves that narrow them into xmm11.
movdqa %xmm14, %xmm11
pand %xmm7, %xmm12
paddd %xmm0, %xmm12
psrad $4, %xmm11
movdqa %xmm13, %xmm0
pand %xmm8, %xmm14
pand %xmm8, %xmm13
paddd %xmm14, %xmm11
psrad $4, %xmm0
paddd %xmm13, %xmm0
movdqa %xmm11, %xmm13
punpckhwd %xmm0, %xmm13
punpcklwd %xmm0, %xmm11
movdqa %xmm11, %xmm0
punpcklwd %xmm13, %xmm11
punpckhwd %xmm13, %xmm0
punpcklwd %xmm0, %xmm11
# Same stage 3 and narrowing for the last two vectors, result in xmm0.
movdqa %xmm1, %xmm0
pand %xmm8, %xmm1
psrad $4, %xmm0
paddd %xmm1, %xmm0
movdqa %xmm12, %xmm1
pand %xmm8, %xmm12
psrad $4, %xmm1
paddd %xmm12, %xmm1
movdqa %xmm0, %xmm12
punpckhwd %xmm1, %xmm12
punpcklwd %xmm1, %xmm0
movdqa %xmm0, %xmm1
punpcklwd %xmm12, %xmm0
punpckhwd %xmm12, %xmm1
punpcklwd %xmm1, %xmm0
# Keep the low byte of every word, pack both halves to 16 bytes and store
# them into the table (aligned store).
movdqa %xmm11, %xmm1
pand %xmm3, %xmm0
pand %xmm3, %xmm1
movdqa %xmm1, %xmm14
packuswb %xmm0, %xmm14
movaps %xmm14, -16(%rax)
# Loop until the cursor reaches the end of the table.
cmpq %rax, %rdx
jne .L4
# Fill arr and check every element with both bit-count methods.
# Register roles in this phase:
#   %rdi = element index i (starts at 0)
#   %ebp = 833 + i (833 is 0x341)
#   %rbx = address of arr
#   %esi = 1321528399, multiplier used to divide by 13 without a div
#   %r12 = address of table
# srand(11)
movl $11, %ecx
movl $833, %ebp
xorl %edi, %edi
call srand
leaq arr(%rip), %rbx
movl $1321528399, %esi
.p2align 4,,10
.L6:
# ecx = i % 13, computed as i - 13 * (i / 13); the quotient comes from the
# high half of i * %esi, shifted right by 2 and corrected for the sign of i.
movl %edi, %eax
movl %edi, %ecx
movl %ebp, %r8d
imull %esi
movl %edi, %eax
sarl $31, %eax
sarl $2, %edx
subl %eax, %edx
leal (%rdx,%rdx,2), %eax
leal (%rdx,%rax,4), %eax
subl %eax, %ecx
# r8d = (833 + i) << (i % 13); stored as arr[i].
sall %cl, %r8d
movl %r8d, %eax
movl %r8d, %edx
movl %r8d, (%rbx,%rdi,4)
# Mask-and-add bit count of r8d, result ends up in %eax.
# Step 1: shift 1, mask 0x55555555.
sarl %eax
andl $1431655765, %edx
andl $1431655765, %eax
addl %edx, %eax
# Step 2: shift 2, mask 0x33333333.
movl %eax, %ecx
sarl $2, %eax
andl $858993459, %eax
andl $858993459, %ecx
addl %eax, %ecx
# Step 3: shift 4, mask 0x0f0f0f0f.
movl %ecx, %edx
sarl $4, %ecx
andl $252645135, %ecx
andl $252645135, %edx
addl %ecx, %edx
# ecx = copy of the element, shifted down by 16 further below for the
# table-based count of its upper half.
movl %r8d, %ecx
# Step 4: shift 8, mask 0x00ff00ff.
movl %edx, %eax
sarl $8, %edx
andl $16711935, %edx
andl $16711935, %eax
sarl $16, %ecx
addl %edx, %eax
# Table-based count: look up each of the four bytes of the element in the
# table (sign-extending byte loads) and add them up in %edx. Interleaved with
# it is step 5 of the mask-and-add count (low 16 bits + high 16 bits of %eax).
movq %r8, %rdx
movzbl %r8b, %r8d
movzbl %dh, %edx
movzwl %ax, %r9d
shrl $16, %eax
movsbl (%r12,%rdx), %r10d
addl %r9d, %eax
movsbl (%r12,%r8), %edx
leal (%r10,%rdx), %r8d
movzbl %ch, %edx
movzbl %cl, %ecx
movsbl (%r12,%rdx), %edx
movsbl (%r12,%rcx), %ecx
addl %ecx, %edx
addl %r8d, %edx
# Both counts must agree; otherwise report through
# _assert(%rcx = .LC11, %rdx = .LC10, %r8d = 32).
cmpl %eax, %edx
je .L5
leaq .LC10(%rip), %rdx
movl $32, %r8d
leaq .LC11(%rip), %rcx
call _assert
.L5:
# Next element; stop after 104857600 elements.
addq $1, %rdi
addl $1, %ebp
cmpq $104857600, %rdi
jne .L6
# Timed phase: 10 epochs, each running three passes over arr and printing
# the clock() differences. clock() results are handled as 32-bit values.
# Register roles:
#   %esi  = remaining epochs (counts down from 10)
#   %r13d = running sum of all bit counts
#   %r15  = end of arr (arr + 419430400 bytes)
#   %ebp / %edi / %r8d = clock() before pass 1 / after pass 1 / after pass 2
#   %xmm6/7/8 = masks loaded at function entry, %xmm9/%xmm10 = masks below
# Print the banner.
leaq .LC12(%rip), %rcx
movl $10, %esi
xorl %r13d, %r13d
call _Z6printfPKcz
# xmm9 = 0x00ff00ff, xmm10 = 0x0000ffff in every lane.
movdqa .LC13(%rip), %xmm9
movdqa .LC14(%rip), %xmm10
.p2align 4,,10
.L10:
call clock
leaq arr(%rip), %r14
# xmm4 = four partial sums for pass 1.
pxor %xmm4, %xmm4
# rbx holds the address of arr here (set before the first epoch and again
# two instructions below), so r15 = end of arr.
leaq 419430400(%rbx), %r15
movl %eax, %ebp
movq %r14, %rbx
movq %r14, %rax
.p2align 4,,10
# Pass 1: mask-and-add bit count, four elements per iteration.
.L7:
movdqa (%rax), %xmm0
addq $16, %rax
# Loop condition is evaluated early; the vector instructions that follow do
# not modify the flags consumed by the jne at the bottom.
cmpq %rax, %r15
movdqa %xmm0, %xmm3
pand %xmm6, %xmm0
psrad $1, %xmm3
pand %xmm6, %xmm3
paddd %xmm0, %xmm3
movdqa %xmm3, %xmm2
pand %xmm7, %xmm3
psrad $2, %xmm2
pand %xmm7, %xmm2
paddd %xmm3, %xmm2
movdqa %xmm2, %xmm1
pand %xmm8, %xmm2
psrad $4, %xmm1
pand %xmm8, %xmm1
paddd %xmm2, %xmm1
movdqa %xmm1, %xmm0
pand %xmm9, %xmm1
psrad $8, %xmm0
pand %xmm9, %xmm0
paddd %xmm1, %xmm0
movdqa %xmm0, %xmm1
pand %xmm10, %xmm0
psrld $16, %xmm1
paddd %xmm1, %xmm0
paddd %xmm0, %xmm4
jne .L7
# Add the four partial sums together and fold them into the running sum.
movdqa %xmm4, %xmm0
psrldq $8, %xmm0
paddd %xmm0, %xmm4
movdqa %xmm4, %xmm0
psrldq $4, %xmm0
paddd %xmm0, %xmm4
movd %xmm4, %eax
addl %eax, %r13d
call clock
leaq arr(%rip), %r8
movl %eax, %edi
.p2align 4,,10
# Pass 2: table-based bit count, one element per iteration (%r8 = cursor).
.L8:
movl (%r8), %edx
addq $4, %r8
movzbl %dh, %eax
movl %edx, %ecx
movzbl %dl, %edx
movsbl (%r12,%rax), %r9d
sarl $16, %ecx
movsbl (%r12,%rdx), %eax
addl %eax, %r9d
movzbl %ch, %eax
movzbl %cl, %ecx
movsbl (%r12,%rax), %eax
movsbl (%r12,%rcx), %edx
addl %edx, %eax
addl %r9d, %eax
addl %eax, %r13d
cmpq %r8, %r15
jne .L8
call clock
movl %eax, %r8d
.p2align 4,,10
# Pass 3: hardware popcnt, one element per iteration (%r14 = cursor).
.L9:
popcntl (%r14), %eax
addq $4, %r14
addl %eax, %r13d
cmpq %r14, %r15
jne .L9
# Keep the third timestamp in the stack slot across the call to clock.
movl %r8d, 44(%rsp)
call clock
movl 44(%rsp), %r8d
# Arguments of the report: edx = pass 1 time, r8d = pass 2 time,
# r9d = pass 3 time.
movl %edi, %edx
leaq .LC0(%rip), %rcx
subl %ebp, %edx
subl %r8d, %eax
subl %edi, %r8d
movl %eax, %r9d
call _Z6printfPKcz.constprop.0
# Next epoch.
subl $1, %esi
jne .L10
# Final phase: add up every 32-bit element of aarr, add the running sum in
# %r13d, print the total with .LC15 and return 0.
leaq aarr(%rip), %rax
pxor %xmm0, %xmm0
# rdx = end of aarr.
leaq 419430400(%rax), %rdx
.p2align 4,,10
# Four partial sums in xmm0, 16 bytes of aarr per iteration.
.L11:
paddd (%rax), %xmm0
addq $16, %rax
cmpq %rax, %rdx
jne .L11
leaq .LC15(%rip), %rcx
# Add the four partial sums together.
movdqa %xmm0, %xmm1
psrldq $8, %xmm1
paddd %xmm1, %xmm0
movdqa %xmm0, %xmm1
psrldq $4, %xmm1
paddd %xmm1, %xmm0
# edx = sum over aarr + running bit-count sum; second argument of printf.
movd %xmm0, %edx
addl %r13d, %edx
call _Z6printfPKcz
nop
# Restore the saved vector registers; eax = 0 is main's return value.
movaps 48(%rsp), %xmm6
xorl %eax, %eax
movaps 64(%rsp), %xmm7
movaps 80(%rsp), %xmm8
movaps 96(%rsp), %xmm9
movaps 112(%rsp), %xmm10
movaps 128(%rsp), %xmm11
movaps 144(%rsp), %xmm12
movaps 160(%rsp), %xmm13
movaps 176(%rsp), %xmm14
# Release the frame and restore the general-purpose registers in reverse
# order of the pushes at entry.
addq $200, %rsp
popq %rbx
popq %rsi
popq %rdi
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15
ret
.seh_endproc
# Zero-initialised global objects.
# aarr: 419430400 bytes, summed by main just before the final printf.
.globl aarr
.bss
.align 32
aarr:
.space 419430400
# arr: 419430400 bytes, i.e. 104857600 four-byte elements; the benchmark input.
.globl arr
.align 32
arr:
.space 419430400
# table: 256 one-byte entries, the per-byte bit-count lookup table.
.globl table
.align 32
table:
.space 256
# 16-byte aligned vector constants loaded by main with movdqa.
.section .rdata,"dr"
.align 16
# Initial index vector {0, 1, 2, 3} for the table-building loop.
.LC1:
.long 0
.long 1
.long 2
.long 3
.align 16
# Index step per table-building iteration (16 in every lane).
.LC2:
.long 16
.long 16
.long 16
.long 16
.align 16
# Offset +4 added to the index vector.
.LC3:
.long 4
.long 4
.long 4
.long 4
.align 16
# Offset +8 added to the index vector.
.LC4:
.long 8
.long 8
.long 8
.long 8
.align 16
# Offset +12 added to the index vector.
.LC5:
.long 12
.long 12
.long 12
.long 12
.align 16
# Mask 0x55555555 (every second bit).
.LC6:
.long 1431655765
.long 1431655765
.long 1431655765
.long 1431655765
.align 16
# Mask 0x33333333 (low two bits of every 4-bit group).
.LC7:
.long 858993459
.long 858993459
.long 858993459
.long 858993459
.align 16
# Mask 0x0f0f0f0f (low nibble of every byte).
.LC8:
.long 252645135
.long 252645135
.long 252645135
.long 252645135
.align 16
# Mask 0x00ff in each of the eight 16-bit lanes.
.LC9:
.word 255
.word 255
.word 255
.word 255
.word 255
.word 255
.word 255
.word 255
.align 16
# Mask 0x00ff00ff (low byte of every 16-bit half).
.LC13:
.long 16711935
.long 16711935
.long 16711935
.long 16711935
.align 16
# Mask 0x0000ffff (low 16 bits of every lane).
.LC14:
.long 65535
.long 65535
.long 65535
.long 65535
# Compiler identification string.
.ident "GCC: (x86_64-posix-seh-rev0, Built by MinGW-W64 project) 5.3.0"
# Declarations of the external functions called from this file.
.def __mingw_vprintf; .scl 2; .type 32; .endef
.def srand; .scl 2; .type 32; .endef
.def _assert; .scl 2; .type 32; .endef
.def clock; .scl 2; .type 32; .endef # called four times per epoch by main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment