Created
October 11, 2013 17:17
-
-
Save oblitum/6938559 to your computer and use it in GitHub Desktop.
Yeppp!: Clang Intel-syntax assembly generated for a trivial test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Excerpt of clang-generated x86-64 assembly (Intel syntax): the timed
# benchmark loop from basics.cpp, scalar code generation.
#
#   r14      = value returned by the first system_clock::now() call
#   ebp      = outer-loop counter i; the loop exits when it reaches 1000000
#   r15, rbx = base addresses of the two double arrays being multiplied
#   rax      = byte offset into both arrays (0, 8, ..., 8184)
#   xmm0     = running sum of the inlined std::inner_product
        call    _ZNSt6chrono3_V212system_clock3nowEv
.Ltmp100:
        mov     r14, rax
.Ltmp101:
        #DEBUG_VALUE: i <- 0
        .align  16, 0x90
.LBB0_6:                                # %.lr.ph.i.preheader
                                        # =>This Loop Header: Depth=1
                                        # Child Loop BB0_7 Depth 2
        #DEBUG_VALUE: size <- 1024
        #DEBUG_VALUE: r <- [RSP+24]
        xorps   xmm0, xmm0              # sum = 0.0 at the start of every outer pass
        xor     eax, eax                # byte offset = 0
        .align  16, 0x90
.LBB0_7:                                # %.lr.ph.i
                                        # Parent Loop BB0_6 Depth=1
                                        # => This Inner Loop Header: Depth=2
        #DEBUG_VALUE: size <- 1024
        #DEBUG_VALUE: r <- [RSP+24]
        .loc    23 183 0                # /usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8/bits/stl_numeric.h:183:0
.Ltmp102:
        movsd   xmm1, qword ptr [r15 + rax]     # load one double from the first array
        mulsd   xmm1, qword ptr [rbx + rax]     # multiply by the matching double of the second array
        addsd   xmm0, xmm1                      # sum += product (one element per iteration)
.Ltmp103:
        #DEBUG_VALUE: inner_product<__gnu_cxx::__normal_iterator<double *, std::vector<double, std::allocator<double> > >, __gnu_cxx::__normal_iterator<double *, std::vector<double, std::allocator<double> > >, double>:__init <- XMM0
        #DEBUG_VALUE: r <- [XMM0+0]
        .loc    21 825 0                # /usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8/bits/stl_iterator.h:825:0
        add     rax, 8                  # advance by sizeof(double)
        cmp     rax, 8192               # 8192 bytes = 1024 doubles
.Ltmp104:
        .loc    23 182 14               # /usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8/bits/stl_numeric.h:182:14
        jne     .LBB0_7
.Ltmp105:
# BB#8:                                 # %_ZSt13inner_productIN9__gnu_cxx17__normal_iteratorIPdSt6vectorIdSaIdEEEES6_dET1_T_S8_T0_S7_.exit
                                        # in Loop: Header=BB0_6 Depth=1
        #DEBUG_VALUE: size <- 1024
        #DEBUG_VALUE: r <- [XMM0+0]
        .loc    1 25 13                 # basics.cpp:25:13
        movsd   qword ptr [rsp + 24], xmm0      # r = sum (stored to its stack slot every pass)
        .loc    1 24 0                  # basics.cpp:24:0
        inc     ebp                     # ++i
.Ltmp106:
        #DEBUG_VALUE: i <- EBP
        cmp     ebp, 1000000
        jne     .LBB0_6
.Ltmp107:
# BB#9:
        #DEBUG_VALUE: size <- 1024
        .loc    1 26 0                  # basics.cpp:26:0
        call    _ZNSt6chrono3_V212system_clock3nowEv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Excerpt of clang-generated x86-64 assembly (Intel syntax): the timed
# benchmark loop from basics.cpp, auto-vectorized with 128-bit SSE2.
#
#   r14      = value returned by the first system_clock::now() call
#   ebp      = outer-loop counter i; the loop exits when it reaches 1000000
#   rbx, r15 = base addresses of the two double arrays being multiplied
#   rax      = element index into both arrays (0, 2, ..., 1022)
#   xmm0     = two partial sums, one per 64-bit lane
        call    _ZNSt6chrono3_V212system_clock3nowEv
.Ltmp100:
        mov     r14, rax
.Ltmp101:
        #DEBUG_VALUE: i <- 0
        .align  16, 0x90
.LBB0_6:                                # %.lr.ph.i.preheader
                                        # =>This Loop Header: Depth=1
                                        # Child Loop BB0_7 Depth 2
        #DEBUG_VALUE: size <- 1024
        #DEBUG_VALUE: r <- [RSP+24]
        xorpd   xmm0, xmm0              # both partial sums = 0.0
        xor     eax, eax                # element index = 0
        .align  16, 0x90
.LBB0_7:                                # %vector.body
                                        # Parent Loop BB0_6 Depth=1
                                        # => This Inner Loop Header: Depth=2
        #DEBUG_VALUE: size <- 1024
        #DEBUG_VALUE: r <- [RSP+24]
        .loc    23 183 0                # /usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8/bits/stl_numeric.h:183:0
.Ltmp102:
        movupd  xmm1, xmmword ptr [rbx + 8*rax] # two doubles from one array (unaligned load)
        movupd  xmm2, xmmword ptr [r15 + 8*rax] # two doubles from the other array
        mulpd   xmm2, xmm1                      # lane-wise products
        addpd   xmm0, xmm2                      # accumulate into the two partial sums
        add     rax, 2                          # two elements consumed per iteration
        cmp     rax, 1024
        jne     .LBB0_7
# BB#8:                                 # %middle.block
                                        # in Loop: Header=BB0_6 Depth=1
        #DEBUG_VALUE: size <- 1024
        #DEBUG_VALUE: r <- [RSP+24]
        # Horizontal reduction: low lane of xmm1 becomes high + low partial sum.
        movapd  xmm1, xmm0
        unpckhpd xmm1, xmm1             # xmm1 = xmm1[1,1]
        addpd   xmm1, xmm0
.Ltmp103:
        .loc    1 25 13                 # basics.cpp:25:13
        movlpd  qword ptr [rsp + 24], xmm1      # r = low lane (the full sum)
        .loc    1 24 0                  # basics.cpp:24:0
        inc     ebp                     # ++i
.Ltmp104:
        #DEBUG_VALUE: i <- EBP
        cmp     ebp, 1000000
        jne     .LBB0_6
.Ltmp105:
# BB#9:
        #DEBUG_VALUE: size <- 1024
        #DEBUG_VALUE: r <- [RSP+24]
        .loc    1 26 0                  # basics.cpp:26:0
        call    _ZNSt6chrono3_V212system_clock3nowEv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Excerpt of clang-generated x86-64 assembly (Intel syntax): the timed
# benchmark loop from basics.cpp, auto-vectorized with 256-bit AVX.
#
#   r14      = value returned by the first system_clock::now() call
#   ebp      = outer-loop counter i; the loop exits when it reaches 1000000
#   rbx, r15 = base addresses of the two double arrays being multiplied
#   rax      = element index into both arrays (0, 4, ..., 1020)
#   ymm0     = four partial sums, one per 64-bit lane
        call    _ZNSt6chrono3_V212system_clock3nowEv
.Ltmp100:
        mov     r14, rax
.Ltmp101:
        #DEBUG_VALUE: i <- 0
        .align  16, 0x90
.LBB0_6:                                # %.lr.ph.i.preheader
                                        # =>This Loop Header: Depth=1
                                        # Child Loop BB0_7 Depth 2
        #DEBUG_VALUE: size <- 1024
        #DEBUG_VALUE: r <- [RSP+24]
        vxorpd  ymm0, ymm0, ymm0        # all four partial sums = 0.0
        xor     eax, eax                # element index = 0
        .align  16, 0x90
.LBB0_7:                                # %vector.body
                                        # Parent Loop BB0_6 Depth=1
                                        # => This Inner Loop Header: Depth=2
        #DEBUG_VALUE: size <- 1024
        #DEBUG_VALUE: r <- [RSP+24]
        .loc    23 183 0                # /usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8/bits/stl_numeric.h:183:0
.Ltmp102:
        # Each 256-bit operand is assembled from two unaligned 128-bit loads.
        vmovupd xmm1, xmmword ptr [rbx + 8*rax + 16]    # elements 2..3 of the first operand
        vmovupd xmm2, xmmword ptr [rbx + 8*rax]         # elements 0..1 of the first operand
        vinsertf128 ymm1, ymm2, xmm1, 1                 # ymm1 = four doubles from [rbx + 8*rax]
        vmovupd xmm2, xmmword ptr [r15 + 8*rax + 16]    # elements 2..3 of the second operand
        vmovupd xmm3, xmmword ptr [r15 + 8*rax]         # elements 0..1 of the second operand
        vinsertf128 ymm2, ymm3, xmm2, 1                 # ymm2 = four doubles from [r15 + 8*rax]
        vmulpd  ymm1, ymm2, ymm1                        # lane-wise products
        vaddpd  ymm0, ymm0, ymm1                        # accumulate into the four partial sums
        add     rax, 4                                  # four elements consumed per iteration
        cmp     rax, 1024
        jne     .LBB0_7
# BB#8:                                 # %middle.block
                                        # in Loop: Header=BB0_6 Depth=1
        #DEBUG_VALUE: size <- 1024
        #DEBUG_VALUE: r <- [RSP+24]
        # Horizontal reduction: fold the upper 128 bits onto the lower, then
        # add the two remaining lanes; the total ends up in the low lane of ymm0.
        vextractf128 xmm1, ymm0, 1
        vaddpd  ymm0, ymm0, ymm1
        vhaddpd ymm0, ymm0, ymm0
.Ltmp103:
        .loc    1 25 13                 # basics.cpp:25:13
        vmovlpd qword ptr [rsp + 24], xmm0      # r = low lane (the full sum)
        .loc    1 24 0                  # basics.cpp:24:0
        inc     ebp                     # ++i
.Ltmp104:
        #DEBUG_VALUE: i <- EBP
        cmp     ebp, 1000000
        jne     .LBB0_6
.Ltmp105:
# BB#9:
        #DEBUG_VALUE: size <- 1024
        #DEBUG_VALUE: r <- [RSP+24]
        .loc    1 26 0                  # basics.cpp:26:0
        vzeroupper                      # clear the upper ymm halves before calling out
        call    _ZNSt6chrono3_V212system_clock3nowEv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;-----------------------------------------------------------------------
; _yepCore_DotProduct_V64fV64f_S64f_SandyBridge   (NASM syntax, AVX)
;
; Dot product of two arrays of 64-bit floating-point values.
;
; In (register use is consistent with the System V AMD64 argument order):
;   rdi = first source array   (non-null, 8-byte aligned)
;   rsi = second source array  (non-null, 8-byte aligned)
;   rdx = address that receives the 64-bit result (non-null, 8-byte aligned)
;   rcx = number of elements in each source array
; Out:
;   eax   = 0 on success,
;           DOT_STATUS_NULL_POINTER if any pointer is null,
;           DOT_STATUS_MISALIGNED_POINTER if any pointer is not 8-byte aligned
;   [rdx] = sum of products (0.0 when rcx == 0); not written on error
; Clobbers: rcx, rdi, rsi, ymm0-ymm15, flags
;
; Structure:
;   1. Single elements are consumed until rsi is 32-byte aligned.
;   2. Batches of 32 elements run through a software-pipelined loop that
;      feeds eight independent accumulators (ymm15, ymm7 .. ymm1).
;   3. The remaining 0..31 elements are consumed one at a time.
;   4. The eight accumulators are summed and reduced to one scalar.
; The instruction order inside the batch loop is deliberate; reordering the
; additions would change floating-point rounding.
;-----------------------------------------------------------------------
%define DOT_STATUS_NULL_POINTER         1
%define DOT_STATUS_MISALIGNED_POINTER   2
%define DOT_ELEMENT_ALIGN_MASK          7       ; low bits that must be clear for 8-byte alignment
%define DOT_YMM_ALIGN_MASK              31      ; low bits that must be clear for 32-byte alignment
%define DOT_BATCH_ELEMENTS              32      ; elements consumed per batch iteration
%define DOT_BATCH_BYTES                 256     ; DOT_BATCH_ELEMENTS * 8 bytes

%ifidn __OUTPUT_FORMAT__, elf64
section .text.SandyBridge progbits alloc exec nowrite align=16
global _yepCore_DotProduct_V64fV64f_S64f_SandyBridge
_yepCore_DotProduct_V64fV64f_S64f_SandyBridge:
%else
section .text
global __yepCore_DotProduct_V64fV64f_S64f_SandyBridge
__yepCore_DotProduct_V64fV64f_S64f_SandyBridge:
%endif
.ENTRY:
        ; --- Argument validation: every pointer must be non-null and 8-byte aligned.
        TEST    rdi, rdi
        JZ      .return_null_pointer
        TEST    rdi, DOT_ELEMENT_ALIGN_MASK
        JNZ     .return_misaligned_pointer
        TEST    rsi, rsi
        JZ      .return_null_pointer
        TEST    rsi, DOT_ELEMENT_ALIGN_MASK
        JNZ     .return_misaligned_pointer
        TEST    rdx, rdx
        JZ      .return_null_pointer
        TEST    rdx, DOT_ELEMENT_ALIGN_MASK
        JNZ     .return_misaligned_pointer

        ; --- Accumulator setup. A VEX-encoded xmm write also clears the upper
        ;     half of the matching ymm register, so each VXORPD zeroes a full ymm.
        VXORPD  xmm15, xmm15, xmm15
        TEST    rcx, rcx
        JZ      .return_ok                      ; empty input: store 0.0 and report success
        VXORPD  xmm7, xmm7, xmm7
        VXORPD  xmm6, xmm6, xmm6
        VXORPD  xmm5, xmm5, xmm5
        VXORPD  xmm4, xmm4, xmm4
        VXORPD  xmm3, xmm3, xmm3
        VXORPD  xmm2, xmm2, xmm2
        VXORPD  xmm1, xmm1, xmm1

        ; --- Peel: consume single elements until rsi is 32-byte aligned
        ;     (or the input is exhausted).
        TEST    rsi, DOT_YMM_ALIGN_MASK
        JZ      .source_y_32b_aligned
.source_y_32b_misaligned:
        VMOVSD  xmm0, [rdi]                     ; scalar load; the upper lanes of ymm0 become zero
        VMULSD  xmm0, xmm0, [rsi]
        VADDPD  ymm15, ymm15, ymm0              ; only lane 0 of ymm0 is non-zero
        ADD     rdi, 8
        ADD     rsi, 8
        SUB     rcx, 1
        JZ      .reduce_batch
        TEST    rsi, DOT_YMM_ALIGN_MASK
        JNZ     .source_y_32b_misaligned

.source_y_32b_aligned:
        SUB     rcx, DOT_BATCH_ELEMENTS
        JB      .batch_process_finish           ; fewer than one full batch left

        ; --- Pipeline prologue: start the first batch. On exit, the products in
        ;     ymm14/ymm10/ymm11 are not yet accumulated, and ymm13 holds loaded
        ;     data from rdi that has not yet been multiplied by [rsi + 224].
.process_batch_prologue:
        VMOVUPD ymm0, [rdi]
        VMOVUPD ymm8, [byte rdi + 32]
        VMOVUPD ymm9, [byte rdi + 64]
        VMULPD  ymm0, ymm0, [rsi]
        VMOVUPD ymm12, [byte rdi + 96]
        VMULPD  ymm8, ymm8, [byte rsi + 32]
        VMOVUPD ymm14, [dword rdi + 128]
        VMULPD  ymm9, ymm9, [byte rsi + 64]
        VMOVUPD ymm10, [dword rdi + 160]
        VMULPD  ymm12, ymm12, [byte rsi + 96]
        VADDPD  ymm15, ymm15, ymm0
        VMOVUPD ymm11, [dword rdi + 192]
        VMULPD  ymm14, ymm14, [dword rsi + 128]
        VADDPD  ymm7, ymm7, ymm8
        VMOVUPD ymm13, [dword rdi + 224]
        VMULPD  ymm10, ymm10, [dword rsi + 160]
        VADDPD  ymm6, ymm6, ymm9
        ADD     rdi, DOT_BATCH_BYTES            ; rdi runs one batch ahead of rsi
        VMULPD  ymm11, ymm11, [dword rsi + 192]
        VADDPD  ymm5, ymm5, ymm12
        SUB     rcx, DOT_BATCH_ELEMENTS
        JB      .process_batch_epilogue         ; only one full batch: just drain the pipeline

        ; --- Steady state: finish the tail of the previous batch while starting
        ;     the next one. rsi still addresses the previous batch until the ADD.
        align 16
.process_batch:
        VMOVUPD ymm0, [rdi]
        VMULPD  ymm13, ymm13, [dword rsi + 224] ; last multiply of the previous batch
        VADDPD  ymm4, ymm4, ymm14
        VMOVUPD ymm8, [byte rdi + 32]
        ADD     rsi, DOT_BATCH_BYTES            ; rsi now addresses the current batch
        VADDPD  ymm3, ymm3, ymm10
        VMOVUPD ymm9, [byte rdi + 64]
        VMULPD  ymm0, ymm0, [rsi]
        VADDPD  ymm2, ymm2, ymm11
        VMOVUPD ymm12, [byte rdi + 96]
        VMULPD  ymm8, ymm8, [byte rsi + 32]
        VADDPD  ymm1, ymm1, ymm13
        VMOVUPD ymm14, [dword rdi + 128]
        VMULPD  ymm9, ymm9, [byte rsi + 64]
        VMOVUPD ymm10, [dword rdi + 160]
        VMULPD  ymm12, ymm12, [byte rsi + 96]
        VADDPD  ymm15, ymm15, ymm0
        VMOVUPD ymm11, [dword rdi + 192]
        VMULPD  ymm14, ymm14, [dword rsi + 128]
        VADDPD  ymm7, ymm7, ymm8
        VMOVUPD ymm13, [dword rdi + 224]
        VMULPD  ymm10, ymm10, [dword rsi + 160]
        VADDPD  ymm6, ymm6, ymm9
        ADD     rdi, DOT_BATCH_BYTES
        VMULPD  ymm11, ymm11, [dword rsi + 192]
        VADDPD  ymm5, ymm5, ymm12
        SUB     rcx, DOT_BATCH_ELEMENTS
        JAE     .process_batch

        ; --- Pipeline epilogue: drain the work still in flight for the last batch.
.process_batch_epilogue:
        VMULPD  ymm13, ymm13, [dword rsi + 224]
        VADDPD  ymm4, ymm4, ymm14
        ADD     rsi, DOT_BATCH_BYTES            ; rsi catches up with rdi
        VADDPD  ymm3, ymm3, ymm10
        VADDPD  ymm2, ymm2, ymm11
        VADDPD  ymm1, ymm1, ymm13

.batch_process_finish:
        ADD     rcx, DOT_BATCH_ELEMENTS         ; undo the last SUB: rcx = elements left (0..31)
        JZ      .reduce_batch

        ; --- Tail: consume the remaining elements one at a time.
.process_single:
        VMOVSD  xmm8, [rdi]                     ; scalar load; the upper lanes of ymm8 become zero
        VMULSD  xmm8, xmm8, [rsi]
        VADDPD  ymm15, ymm15, ymm8
        ADD     rdi, 8
        ADD     rsi, 8
        SUB     rcx, 1
        JNZ     .process_single

        ; --- Reduction: pairwise-sum the eight accumulators into ymm15, then
        ;     fold its four lanes into lane 0.
.reduce_batch:
        VADDPD  ymm15, ymm15, ymm7
        VADDPD  ymm6, ymm6, ymm5
        VADDPD  ymm4, ymm4, ymm3
        VADDPD  ymm2, ymm2, ymm1
        VADDPD  ymm15, ymm15, ymm6
        VADDPD  ymm4, ymm4, ymm2
        VADDPD  ymm15, ymm15, ymm4
        VEXTRACTF128 xmm8, ymm15, 1             ; xmm8 = upper two lanes
        VADDPD  xmm15, xmm15, xmm8              ; two lanes remain
        VUNPCKHPD xmm8, xmm15, xmm15            ; xmm8 lane 0 = lane 1 of xmm15
        VADDSD  xmm15, xmm15, xmm8              ; lane 0 = total

.return_ok:
        VMOVSD  [rdx], xmm15                    ; store the result
        XOR     eax, eax                        ; status 0 = success
.return:
        VZEROUPPER                              ; leave the upper ymm halves clean for the caller
        RET

.return_null_pointer:
        MOV     eax, DOT_STATUS_NULL_POINTER
        JMP     .return

.return_misaligned_pointer:
        MOV     eax, DOT_STATUS_MISALIGNED_POINTER
        JMP     .return
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment