Skip to content

Instantly share code, notes, and snippets.

@SwapnilGaikwad
Created July 5, 2022 15:15
Show Gist options
  • Save SwapnilGaikwad/f9d5c3aea191562dd44600806e0e1e05 to your computer and use it in GitHub Desktop.
Save SwapnilGaikwad/f9d5c3aea191562dd44600806e0e1e05 to your computer and use it in GitHub Desktop.
; Assembly listing for method System.Text.Tests.AsciiUtilityTests:GetIndexOfFirstNonAsciiChar_Sse2(long,long):long
; Emitting BLENDED_CODE for X64 CPU with AVX - Unix
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 2 inlinees with PGO data; 9 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 34, 35.50) long -> rbx
; V01 arg1 [V01,T01] ( 19, 11 ) long -> rsi
;* V02 loc0 [V02,T08] ( 0, 0 ) int -> zero-ref single-def
;* V03 loc1 [V03,T09] ( 0, 0 ) int -> zero-ref single-def
; V04 loc2 [V04,T10] ( 11, 12.50) simd16 -> mm1
; V05 loc3 [V05,T11] ( 3, 8.50) simd16 -> mm2
; V06 loc4 [V06,T04] ( 6, 3 ) int -> rdi
; V07 loc5 [V07,T03] ( 8, 4 ) long -> r14 single-def
; V08 loc6 [V08,T12] ( 2, 4.50) simd16 -> mm0 single-def
;* V09 loc7 [V09,T13] ( 0, 0 ) simd16 -> zero-ref single-def
; V10 loc8 [V10,T06] ( 3, 1.50) int -> rdi
; V11 loc9 [V11,T02] ( 2, 4.50) long -> rax single-def
;* V12 loc10 [V12 ] ( 0, 0 ) simd16 -> zero-ref
; V13 loc11 [V13,T05] ( 4, 2 ) long -> rdi
;* V14 loc12 [V14 ] ( 0, 0 ) int -> zero-ref
;# V15 OutArgs [V15 ] ( 1, 1 ) lclBlk ( 0) [rsp+00H] "OutgoingArgSpace"
;* V16 tmp1 [V16 ] ( 0, 0 ) int -> zero-ref "Inline return value spill temp"
;* V17 tmp2 [V17 ] ( 0, 0 ) int -> zero-ref "Inline return value spill temp"
;* V18 tmp3 [V18 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
;* V19 tmp4 [V19 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
; V20 cse0 [V20,T07] ( 3, 1.50) long -> rdi "CSE - moderate"
;
; Lcl frame size = 0
;-----------------------------------------------------------------------
; GetIndexOfFirstNonAsciiChar_Sse2(long, long) : long
; Syntax: Intel operand order (dst, src). Target: x64, Unix.
; In:    rdi = arg0, the buffer address (copied to rbx, later to r14)
;        rsi = arg1, the number of 2-byte elements in the buffer
;              (it is doubled below to obtain a byte count)
; Out:   rax = (stop address - buffer start) / 2, i.e. the number of
;              2-byte elements that precede the first element whose
;              value is above 0x7F; equals arg1 when none is found,
;              and 0 when arg1 is 0.
; Regs:  r14 = original buffer start (never modified after IG05)
;        rbx = current scan position, in bytes
;        xmm0 = RWD00 mask while the main loop runs
; Saves: rbp, r14, rbx are pushed in the prolog and popped on exit.
;-----------------------------------------------------------------------
; Prolog: save the registers this method reuses and set up rbp.
G_M26287_IG01:
push rbp
push r14
push rbx
vzeroupper
lea rbp, [rsp+10H]
; rbx = arg0 (buffer address).
mov rbx, rdi
;; size=15 bbWeight=1 PerfScore 4.75
; Length check: a zero-length buffer returns 0 immediately.
G_M26287_IG02:
test rsi, rsi
jne SHORT G_M26287_IG05
;; size=5 bbWeight=1 PerfScore 1.25
G_M26287_IG03:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
; Epilog for the zero-length path.
G_M26287_IG04:
pop rbx
pop r14
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; Vector path setup. r14 keeps the buffer start so the final byte
; offset can be computed in IG10.
G_M26287_IG05:
mov r14, rbx
; Fewer than 8 elements (16 bytes): use the scalar path at IG16.
cmp rsi, 8
jb G_M26287_IG16
; xmm0 = 0xFF80 in every 16-bit lane (RWD00).
vmovupd xmm0, xmmword ptr [reloc @RWD00]
; Check the first 16 bytes with an unaligned load.
vmovdqu xmm1, xmmword ptr [r14]
; Saturating add of 0x7F80 per lane: bit 15 of a lane ends up set
; exactly when the lane value was >= 0x80.
vpaddusw xmm1, xmm1, xmmword ptr [reloc @RWD16]
; edi = top bit of every byte; 0xAAAA keeps the odd bits, which are
; the top bits of each lane's high byte.
vpmovmskb edi, xmm1
test edi, 0xAAAA
; A hit in the first vector: IG14 turns the mask into an offset
; (rbx still equals the buffer start here).
jne G_M26287_IG14
; rsi = length in bytes from here on.
add rsi, rsi
; Under 32 bytes in total: skip the loop, IG08 steps past the 16
; bytes just checked and IG09 handles what is left.
cmp rsi, 32
jb SHORT G_M26287_IG08
; rbx = next 16-byte boundary after the start. The bytes skipped
; were covered by the unaligned load above.
lea rbx, [r14+16]
and rbx, -16
; rsi = bytes remaining from rbx to the end of the buffer.
add rsi, r14
sub rsi, rbx
cmp rsi, 32
jb SHORT G_M26287_IG07
; rax = end - 32: the last address from which a whole 32-byte block
; still fits inside the buffer.
lea rax, [rbx+rsi]
sub rax, 32
align [11 bytes for IG06]
;; size=101 bbWeight=0.50 PerfScore 9.38
; Main loop: 32 bytes per iteration using aligned loads.
G_M26287_IG06:
vmovdqa xmm1, xmmword ptr [rbx]
vmovdqa xmm2, xmmword ptr [rbx+16]
; OR both vectors so a single test covers all 32 bytes.
vpor xmm3, xmm1, xmm2
; ZF is cleared when any bit selected by the 0xFF80 mask is set.
vptest xmm3, xmm0
jne SHORT G_M26287_IG12
add rbx, 32
cmp rbx, rax
jbe SHORT G_M26287_IG06
;; size=29 bbWeight=4 PerfScore 55.33
; Fewer than 32 bytes remain. Bit 4 of the byte count says whether a
; full 16-byte vector is still left at rbx.
G_M26287_IG07:
test sil, 16
je SHORT G_M26287_IG09
vmovdqa xmm1, xmmword ptr [rbx]
vptest xmm1, xmmword ptr [reloc @RWD00]
jne SHORT G_M26287_IG13
;; size=21 bbWeight=0.50 PerfScore 5.62
; Step past 16 bytes already verified (fall-through from IG07, or the
; short-buffer jump from IG05).
G_M26287_IG08:
add rbx, 16
;; size=4 bbWeight=0.50 PerfScore 0.12
; Tail: the low four bits of the byte count are the leftover bytes.
G_M26287_IG09:
movzx rax, sil
test al, 15
je SHORT G_M26287_IG10
; Move rbx back so that one last unaligned 16-byte load ends exactly
; at the end of the buffer; it overlaps bytes already checked.
mov rax, rsi
and rax, 15
add rax, rbx
mov rbx, rax
sub rbx, 16
vmovdqu xmm1, xmmword ptr [rbx]
vptest xmm1, xmmword ptr [reloc @RWD00]
jne SHORT G_M26287_IG13
; Nothing found: rbx now points at the end of the buffer.
add rbx, 16
;; size=44 bbWeight=0.50 PerfScore 6.50
; Common exit: rax = (rbx - buffer start) / 2, a byte offset turned
; into a count of 2-byte elements.
G_M26287_IG10:
mov rax, rbx
sub rax, r14
shr rax, 1
;; size=9 bbWeight=0.50 PerfScore 0.50
; Epilog for every non-empty path.
G_M26287_IG11:
pop rbx
pop r14
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; The loop saw a hit in xmm1|xmm2: decide which of the two vectors
; holds it.
G_M26287_IG12:
vptest xmm1, xmmword ptr [reloc @RWD00]
jne SHORT G_M26287_IG13
; First vector is clean, so the hit is in the second one: advance
; rbx to it and make it the vector examined below.
add rbx, 16
vmovaps xmm1, xmm2
;; size=19 bbWeight=0.50 PerfScore 3.25
; xmm1 contains the hit and rbx is its address. Rebuild the per-byte
; mask as in IG05 (xmm0 is overwritten; the mask is no longer needed).
G_M26287_IG13:
vpaddusw xmm0, xmm1, xmmword ptr [reloc @RWD16]
vpmovmskb edi, xmm0
;; size=12 bbWeight=0.50 PerfScore 2.00
; edi = byte mask of the vector at rbx. After the AND only odd bits
; remain, so tzcnt yields 2*k+1 for the first offending lane k and
; subtracting 1 gives that lane's byte offset.
G_M26287_IG14:
and edi, 0xAAAA
; esi is cleared before tzcnt overwrites it - presumably to avoid a
; false dependency on the destination register; TODO confirm.
xor esi, esi
tzcnt esi, edi
mov edi, esi
lea rbx, [rbx+rdi-1]
jmp SHORT G_M26287_IG10
;; size=21 bbWeight=0.50 PerfScore 2.88
; Reached only from IG19, with edi holding the 32-bit value that
; failed the test there. The helper receives it in edi; going by its
; name it reports whether the first element is ASCII. A nonzero
; result advances rbx by one element (2 bytes).
G_M26287_IG15:
call [System.Text.Tests.AsciiUtilityTests:FirstCharInUInt32IsAscii(int):bool]
test eax, eax
je SHORT G_M26287_IG10
add rbx, 2
jmp SHORT G_M26287_IG10
;; size=16 bbWeight=0.50 PerfScore 3.25
; Scalar path for fewer than 8 elements. rsi is still the element
; count here (not doubled); its low three bits select 4-, 2- and
; 1-element steps.
G_M26287_IG16:
test sil, 4
je SHORT G_M26287_IG19
; Load four elements (8 bytes) and AND them with a 64-bit mask.
; NOTE(review): 0xD1FFAB1E looks like a placeholder printed by the
; listing rather than the real immediate (presumably 0xFF80 per
; 16-bit element) - confirm against a non-diffable dump.
mov rdi, qword ptr [r14]
mov rax, 0xD1FFAB1E
and rdi, rax
je SHORT G_M26287_IG18
;; size=24 bbWeight=0.50 PerfScore 2.38
; A masked bit is set in the 8-byte load: locate the element.
; tzcnt gives the lowest set bit, >> 3 converts it to a byte index,
; and & -2 rounds that down to an even (element) offset.
G_M26287_IG17:
xor esi, esi
tzcnt rsi, rdi
mov edi, esi
sar edi, 3
movsxd rbx, edi
and rbx, -2
add rbx, r14
jmp G_M26287_IG10
;; size=27 bbWeight=0.50 PerfScore 2.88
; All four elements passed: continue 8 bytes past the start.
G_M26287_IG18:
lea rbx, [r14+8]
;; size=4 bbWeight=0.50 PerfScore 0.25
; Two-element step: test a 32-bit load against the mask (same
; placeholder caveat as in IG16).
G_M26287_IG19:
test sil, 2
je SHORT G_M26287_IG20
mov edi, dword ptr [rbx]
test edi, 0xD1FFAB1E
jne SHORT G_M26287_IG15
add rbx, 4
;; size=20 bbWeight=0.50 PerfScore 2.38
; One-element step: a value above 127 stops the scan at rbx,
; otherwise rbx advances past it.
G_M26287_IG20:
test sil, 1
je G_M26287_IG10
cmp word ptr [rbx], 127
ja G_M26287_IG10
add rbx, 2
jmp G_M26287_IG10
;; size=29 bbWeight=0.50 PerfScore 3.75
; RWD00: 0xFF80 in each of the eight 16-bit lanes; used with vptest
; to detect any lane whose value is >= 0x80.
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h
; RWD16: 0x7F80 in each of the eight 16-bit lanes; the addend for
; the saturating vpaddusw in IG05 and IG13.
RWD16 dq 7F807F807F807F80h, 7F807F807F807F80h
; Total bytes of code 412, prolog size 15, PerfScore 151.08, instruction count 111, allocated bytes for code 420 (MethodHash=335d9950) for method System.Text.Tests.AsciiUtilityTests:GetIndexOfFirstNonAsciiChar_Sse2(long,long):long
; ============================================================
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment