|
; Assembly listing for method BenchFind.BatchFinder:Find_AVX_256_Optimized(System.ReadOnlySpan`1[int],int):int |
|
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows |
|
; optimized code |
|
; rsp based frame |
|
; fully interruptible |
|
; No PGO data |
|
; 2 inlinees with PGO data; 3 single block inlinees; 0 inlinees without PGO data |
|
|
|
; Purpose: return the index of the first element equal to the searched value, or -1 when no element matches. |
|
; In:  rcx -> span (element reference at [rcx], 32-bit length at [rcx+08H]); edx = value to find. |
|
; Out: eax = index of the first match, or -1. |
|
; Scratch: r8 (loop bound), r9 (element index), r10 (byte offset), ymm0-ymm5, flags. |
|
; Shape: 32 elements per iteration (4 x ymm), then 8 per iteration (1 x ymm), then a scalar tail. |
|
G_M000_IG01: ;; offset=0000H |
|
C5F877 vzeroupper |
|
|
|
G_M000_IG02: ;; offset=0003H |
|
488B01 mov rax, bword ptr [rcx] ; rax = reference to element 0 |
|
C4E1796EC2 vmovd xmm0, edx ; low dword of xmm0 = value |
|
C4E27D58C0 vpbroadcastd ymm0, ymm0 ; replicate the low dword into all 8 dword lanes |
|
8B4908 mov ecx, dword ptr [rcx+08H] ; ecx = length (rcx no longer points at the span) |
|
4863C9 movsxd rcx, ecx ; rcx = length as a signed 64-bit value |
|
4C8BC1 mov r8, rcx |
|
4983E0E0 and r8, -32 ; r8 = length rounded down to a multiple of 32 |
|
4533C9 xor r9d, r9d ; r9 = current element index = 0 |
|
4D85C0 test r8, r8 |
|
7E45 jle SHORT G_M000_IG04 ; no full 32-element chunk: skip the 4 x ymm loop |
|
align [0 bytes for IG03] |
|
|
|
G_M000_IG03: ;; offset=0025H |
|
4D8BD1 mov r10, r9 |
|
49C1E202 shl r10, 2 ; r10 = byte offset of element r9 (4 bytes per element) |
|
C4A17D760C10 vpcmpeqd ymm1, ymm0, ymmword ptr[rax+r10] ; ymm1 = match mask for elements r9+0 .. r9+7 |
|
C4A17D76541020 vpcmpeqd ymm2, ymm0, ymmword ptr[rax+r10+20H] ; ymm2 = match mask for elements r9+8 .. r9+15 |
|
C4A17D765C1040 vpcmpeqd ymm3, ymm0, ymmword ptr[rax+r10+40H] ; ymm3 = match mask for elements r9+16 .. r9+23 |
|
C4A17D76641060 vpcmpeqd ymm4, ymm0, ymmword ptr[rax+r10+60H] ; ymm4 = match mask for elements r9+24 .. r9+31 |
|
C4E175EBEA vpor ymm5, ymm1, ymm2 ; ymm5 accumulates the OR of all four masks |
|
C4E155EBEB vpor ymm5, ymm5, ymm3 |
|
C4E155EBEC vpor ymm5, ymm5, ymm4 |
|
C4E27D17ED vptest ymm5, ymm5 ; ZF = 1 only when no lane matched |
|
7571 jne SHORT G_M000_IG14 ; some lane matched: locate it in IG14 (ymm1-ymm4 still hold the masks) |
|
4983C120 add r9, 32 |
|
4D3BC8 cmp r9, r8 |
|
7CBB jl SHORT G_M000_IG03 ; loop while another full 32-element chunk remains |
|
|
|
G_M000_IG04: ;; offset=006AH |
|
4C8BC1 mov r8, rcx |
|
4983E0F8 and r8, -8 ; r8 = length rounded down to a multiple of 8 |
|
4D3BC8 cmp r9, r8 |
|
7D20 jge SHORT G_M000_IG06 ; no full 8-element chunk left: go to the scalar tail |
|
66660F1F840000000000 align [10 bytes for IG05] |
|
|
|
G_M000_IG05: ;; offset=0080H |
|
C4A17D761C88 vpcmpeqd ymm3, ymm0, ymmword ptr[rax+4*r9] ; ymm3 = match mask for elements r9+0 .. r9+7 |
|
C4E27D17DB vptest ymm3, ymm3 ; ZF = 1 only when no lane matched |
|
7531 jne SHORT G_M000_IG12 ; some lane matched: locate it in IG12 |
|
4983C108 add r9, 8 |
|
4D3BC8 cmp r9, r8 |
|
7CEA jl SHORT G_M000_IG05 |
|
|
|
G_M000_IG06: ;; offset=0096H |
|
4C3BC9 cmp r9, rcx |
|
7D13 jge SHORT G_M000_IG08 ; index reached the length: nothing left to scan |
|
0F1F440000 align [5 bytes for IG07] |
|
|
|
G_M000_IG07: ;; offset=00A0H |
|
42391488 cmp dword ptr [rax+4*r9], edx ; scalar tail: compare one element with the value |
|
7411 je SHORT G_M000_IG10 |
|
49FFC1 inc r9 |
|
4C3BC9 cmp r9, rcx |
|
7CF2 jl SHORT G_M000_IG07 |
|
|
|
G_M000_IG08: ;; offset=00AEH |
|
B8FFFFFFFF mov eax, -1 ; not found |
|
|
|
G_M000_IG09: ;; offset=00B3H |
|
C5F877 vzeroupper |
|
C3 ret |
|
|
|
G_M000_IG10: ;; offset=00B7H |
|
418BC1 mov eax, r9d ; found by the scalar tail: the index is r9 |
|
|
|
G_M000_IG11: ;; offset=00BAH |
|
C5F877 vzeroupper |
|
C3 ret |
|
|
|
G_M000_IG12: ;; offset=00BEH |
|
C5FC50C3 vmovmskps eax, ymm3 ; eax bit k = top bit of dword lane k (ModRM C3: destination is eax) |
|
F30FBCC0 tzcnt eax, eax ; lowest set bit = first matching lane |
|
4103C1 add eax, r9d ; add the chunk's base index |
|
|
|
G_M000_IG13: ;; offset=00C9H |
|
C5F877 vzeroupper |
|
C3 ret |
|
|
|
G_M000_IG14: ;; offset=00CDH |
|
C5E56BC4 vpackssdw ymm0, ymm3, ymm4 ; narrow masks 3 and 4 to words (per 128-bit lane, so halves interleave); ymm0 is reused, the broadcast is no longer needed |
|
C4E3FD00C0D8 vpermq ymm0, ymm0, -40 ; imm D8H picks qwords 0,2,1,3: words back in element order (16..31) |
|
C5F56BCA vpackssdw ymm1, ymm1, ymm2 ; same narrowing for masks 1 and 2 |
|
C4E3FD00C9D8 vpermq ymm1, ymm1, -40 ; words back in element order (0..15) |
|
C5F563C0 vpacksswb ymm0, ymm1, ymm0 ; narrow both word vectors to bytes (again interleaved per 128-bit lane) |
|
C4E3FD00C0D8 vpermq ymm0, ymm0, -40 ; bytes back in element order: byte k is the mask of element r9+k |
|
C5FDD7C0 vpmovmskb eax, ymm0 ; eax bit k = top bit of byte k, one bit per element of the chunk |
|
F30FBCC0 tzcnt eax, eax ; lowest set bit = first matching element within the chunk |
|
4103C1 add eax, r9d ; add the chunk's base index |
|
|
|
G_M000_IG15: ;; offset=00F6H |
|
C5F877 vzeroupper |
|
C3 ret |
|
|
|
; Total bytes of code 250 |
Yep, indeed, I forgot to do that after changing the order of the pack and permute - before, the ORs were less costly, but now it makes complete sense to change it 👍