Skip to content

Instantly share code, notes, and snippets.

@udoprog
Created December 7, 2022 13:34
Show Gist options
  • Save udoprog/ec85576b1d0baa120b068feb92536ab5 to your computer and use it in GitHub Desktop.
Save udoprog/ec85576b1d0baa120b068feb92536ab5 to your computer and use it in GitHub Desktop.
--- a.txt 2022-12-07 14:31:31.870270000 +0100
+++ b.txt 2022-12-07 14:31:58.517728400 +0100
@@ -16,34 +16,40 @@
.zero 1
.zero 1
.LCPI0_1:
- .long 2147483648
- .long 2147483648
- .long 2147483648
- .long 2147483648
-example::diff:
- xor eax, eax
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+.LCPI0_2:
+ .long 1
+ .long 1
+ .long 1
+ .long 1
+example::diff2:
+ xor ecx, ecx
test rsi, rsi
je .LBB0_12
- mov r10, rdi
+ mov rax, rdi
cmp rsi, 8
jb .LBB0_10
- mov r10, rsi
- and r10, -8
- lea rax, [r10 - 8]
- mov r8, rax
+ mov rax, rsi
+ and rax, -8
+ lea rcx, [rax - 8]
+ mov r8, rcx
shr r8, 3
inc r8
- test rax, rax
+ test rcx, rcx
je .LBB0_3
mov r9, r8
and r9, -2
- pxor xmm10, xmm10
+ pxor xmm2, xmm2
xor ecx, ecx
pcmpeqd xmm8, xmm8
movdqa xmm9, xmmword ptr [rip + .LCPI0_0]
- movdqa xmm4, xmmword ptr [rip + .LCPI0_1]
- xorps xmm0, xmm0
- xorps xmm1, xmm1
+ movdqa xmm5, xmmword ptr [rip + .LCPI0_1]
+ movdqa xmm6, xmmword ptr [rip + .LCPI0_2]
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
.LBB0_5:
movd xmm7, dword ptr [rdi + rcx]
movd xmm3, dword ptr [rdi + rcx + 4]
@@ -51,185 +57,143 @@
paddb xmm3, xmm8
pand xmm7, xmm9
pand xmm3, xmm9
- punpcklbw xmm7, xmm10
- punpcklwd xmm7, xmm10
- punpcklbw xmm3, xmm10
- punpcklwd xmm3, xmm10
- pshuflw xmm6, xmm7, 254
- movdqa xmm5, xmm4
- psrld xmm5, xmm6
- pshuflw xmm2, xmm7, 84
- movdqa xmm6, xmm4
- psrld xmm6, xmm2
- punpcklqdq xmm6, xmm5
- pshufd xmm2, xmm7, 238
- pshuflw xmm5, xmm2, 254
- movdqa xmm7, xmm4
- psrld xmm7, xmm5
- pshuflw xmm2, xmm2, 84
- movdqa xmm5, xmm4
- psrld xmm5, xmm2
- punpckhqdq xmm5, xmm7
- shufps xmm6, xmm5, 204
- pshuflw xmm2, xmm3, 254
- movdqa xmm5, xmm4
- psrld xmm5, xmm2
- pshuflw xmm2, xmm3, 84
- movdqa xmm7, xmm4
- psrld xmm7, xmm2
- orps xmm6, xmm0
- punpcklqdq xmm7, xmm5
- pshufd xmm0, xmm3, 238
- pshuflw xmm2, xmm0, 254
- movdqa xmm3, xmm4
- psrld xmm3, xmm2
- pshuflw xmm0, xmm0, 84
- movdqa xmm2, xmm4
- psrld xmm2, xmm0
- punpckhqdq xmm2, xmm3
- shufps xmm7, xmm2, 204
- orps xmm7, xmm1
- movd xmm1, dword ptr [rdi + rcx + 8]
- movd xmm3, dword ptr [rdi + rcx + 12]
+ punpcklbw xmm7, xmm2
+ punpcklwd xmm7, xmm2
+ punpcklbw xmm3, xmm2
+ punpcklwd xmm3, xmm2
+ pslld xmm7, 23
+ paddd xmm7, xmm5
+ cvttps2dq xmm7, xmm7
+ pshufd xmm4, xmm7, 245
+ pmuludq xmm7, xmm6
+ pshufd xmm7, xmm7, 232
+ pmuludq xmm4, xmm6
+ pshufd xmm4, xmm4, 232
+ punpckldq xmm7, xmm4
+ por xmm7, xmm0
+ pslld xmm3, 23
+ paddd xmm3, xmm5
+ cvttps2dq xmm0, xmm3
+ pshufd xmm4, xmm0, 245
+ pmuludq xmm0, xmm6
+ pshufd xmm3, xmm0, 232
+ pmuludq xmm4, xmm6
+ pshufd xmm0, xmm4, 232
+ punpckldq xmm3, xmm0
+ por xmm3, xmm1
+ movd xmm0, dword ptr [rdi + rcx + 8]
+ movd xmm1, dword ptr [rdi + rcx + 12]
+ paddb xmm0, xmm8
paddb xmm1, xmm8
- paddb xmm3, xmm8
+ pand xmm0, xmm9
pand xmm1, xmm9
- pand xmm3, xmm9
- punpcklbw xmm1, xmm10
- punpcklwd xmm1, xmm10
- punpcklbw xmm3, xmm10
- punpcklwd xmm3, xmm10
- pshuflw xmm0, xmm1, 254
- movdqa xmm2, xmm4
- psrld xmm2, xmm0
- pshuflw xmm5, xmm1, 84
- movdqa xmm0, xmm4
- psrld xmm0, xmm5
- punpcklqdq xmm0, xmm2
- pshufd xmm1, xmm1, 238
- pshuflw xmm2, xmm1, 254
- movdqa xmm5, xmm4
- psrld xmm5, xmm2
- pshuflw xmm1, xmm1, 84
- movdqa xmm2, xmm4
- psrld xmm2, xmm1
- punpckhqdq xmm2, xmm5
- shufps xmm0, xmm2, 204
- pshuflw xmm1, xmm3, 254
- movdqa xmm2, xmm4
- psrld xmm2, xmm1
- pshuflw xmm5, xmm3, 84
- movdqa xmm1, xmm4
- psrld xmm1, xmm5
- orps xmm0, xmm6
- punpcklqdq xmm1, xmm2
- pshufd xmm2, xmm3, 238
- pshuflw xmm3, xmm2, 254
- movdqa xmm5, xmm4
- psrld xmm5, xmm3
- pshuflw xmm2, xmm2, 84
- movdqa xmm3, xmm4
- psrld xmm3, xmm2
- punpckhqdq xmm3, xmm5
- shufps xmm1, xmm3, 204
- orps xmm1, xmm7
+ punpcklbw xmm0, xmm2
+ punpcklwd xmm0, xmm2
+ punpcklbw xmm1, xmm2
+ punpcklwd xmm1, xmm2
+ pslld xmm0, 23
+ paddd xmm0, xmm5
+ cvttps2dq xmm0, xmm0
+ pshufd xmm4, xmm0, 245
+ pmuludq xmm0, xmm6
+ pshufd xmm0, xmm0, 232
+ pmuludq xmm4, xmm6
+ pshufd xmm4, xmm4, 232
+ punpckldq xmm0, xmm4
+ por xmm0, xmm7
+ pslld xmm1, 23
+ paddd xmm1, xmm5
+ cvttps2dq xmm1, xmm1
+ pshufd xmm4, xmm1, 245
+ pmuludq xmm1, xmm6
+ pshufd xmm1, xmm1, 232
+ pmuludq xmm4, xmm6
+ pshufd xmm4, xmm4, 232
+ punpckldq xmm1, xmm4
+ por xmm1, xmm3
add rcx, 16
add r9, -2
jne .LBB0_5
test r8b, 1
je .LBB0_8
.LBB0_7:
- movd xmm4, dword ptr [rdi + rcx]
+ movd xmm3, dword ptr [rdi + rcx]
movd xmm2, dword ptr [rdi + rcx + 4]
- pcmpeqd xmm3, xmm3
- paddb xmm4, xmm3
+ pcmpeqd xmm4, xmm4
+ paddb xmm3, xmm4
movdqa xmm5, xmmword ptr [rip + .LCPI0_0]
- paddb xmm2, xmm3
- pand xmm4, xmm5
+ paddb xmm2, xmm4
+ pand xmm3, xmm5
pand xmm2, xmm5
- pxor xmm3, xmm3
- punpcklbw xmm4, xmm3
- punpcklwd xmm4, xmm3
- punpcklbw xmm2, xmm3
- punpcklwd xmm2, xmm3
- pshuflw xmm5, xmm4, 254
- movdqa xmm3, xmmword ptr [rip + .LCPI0_1]
- movdqa xmm6, xmm3
- psrld xmm6, xmm5
- pshuflw xmm5, xmm4, 84
- movdqa xmm7, xmm3
- psrld xmm7, xmm5
- pshufd xmm4, xmm4, 238
- pshuflw xmm8, xmm4, 254
- movdqa xmm5, xmm3
- psrld xmm5, xmm8
- pshuflw xmm8, xmm4, 84
- movdqa xmm4, xmm3
- psrld xmm4, xmm8
- punpcklqdq xmm7, xmm6
- punpckhqdq xmm4, xmm5
- shufps xmm7, xmm4, 204
- pshuflw xmm4, xmm2, 254
- movdqa xmm5, xmm3
- psrld xmm5, xmm4
- pshuflw xmm4, xmm2, 84
- movdqa xmm6, xmm3
- psrld xmm6, xmm4
- pshufd xmm2, xmm2, 238
- pshuflw xmm8, xmm2, 254
- movdqa xmm4, xmm3
- psrld xmm4, xmm8
- orps xmm0, xmm7
- pshuflw xmm2, xmm2, 84
- psrld xmm3, xmm2
- punpcklqdq xmm6, xmm5
- punpckhqdq xmm3, xmm4
- shufps xmm6, xmm3, 204
- orps xmm1, xmm6
+ pxor xmm4, xmm4
+ punpcklbw xmm3, xmm4
+ punpcklwd xmm3, xmm4
+ punpcklbw xmm2, xmm4
+ punpcklwd xmm2, xmm4
+ pslld xmm3, 23
+ movdqa xmm4, xmmword ptr [rip + .LCPI0_1]
+ paddd xmm3, xmm4
+ cvttps2dq xmm3, xmm3
+ movdqa xmm5, xmmword ptr [rip + .LCPI0_2]
+ pshufd xmm6, xmm3, 245
+ pmuludq xmm3, xmm5
+ pshufd xmm3, xmm3, 232
+ pmuludq xmm6, xmm5
+ pshufd xmm6, xmm6, 232
+ punpckldq xmm3, xmm6
+ por xmm0, xmm3
+ pslld xmm2, 23
+ paddd xmm2, xmm4
+ cvttps2dq xmm2, xmm2
+ pshufd xmm3, xmm2, 245
+ pmuludq xmm2, xmm5
+ pshufd xmm2, xmm2, 232
+ pmuludq xmm3, xmm5
+ pshufd xmm3, xmm3, 232
+ punpckldq xmm2, xmm3
+ por xmm1, xmm2
.LBB0_8:
- orps xmm0, xmm1
+ por xmm0, xmm1
pshufd xmm1, xmm0, 238
por xmm1, xmm0
pshufd xmm0, xmm1, 85
por xmm0, xmm1
- movd eax, xmm0
- cmp r10, rsi
+ movd ecx, xmm0
+ cmp rax, rsi
je .LBB0_12
- add r10, rdi
+ add rax, rdi
.LBB0_10:
add rdi, rsi
.LBB0_11:
- movzx ecx, byte ptr [r10]
- dec cl
- mov esi, -2147483648
- shr esi, cl
- inc r10
- or eax, esi
- cmp r10, rdi
+ movzx esi, byte ptr [rax]
+ inc rax
+ dec sil
+ bts ecx, esi
+ cmp rax, rdi
jne .LBB0_11
.LBB0_12:
- mov ecx, eax
- shr ecx
- and ecx, 1431655765
- sub eax, ecx
- mov ecx, eax
- and ecx, 858993459
- shr eax, 2
+ mov eax, ecx
+ shr eax
+ and eax, 1431655765
+ sub ecx, eax
+ mov eax, ecx
and eax, 858993459
- add eax, ecx
- mov ecx, eax
- shr ecx, 4
+ shr ecx, 2
+ and ecx, 858993459
add ecx, eax
- and ecx, 252645135
- imul eax, ecx, 16843009
+ mov eax, ecx
+ shr eax, 4
+ add eax, ecx
+ and eax, 252645135
+ imul eax, eax, 16843009
shr eax, 24
cmp eax, edx
sete al
ret
.LBB0_3:
- xorps xmm0, xmm0
+ pxor xmm0, xmm0
xor ecx, ecx
- xorps xmm1, xmm1
+ pxor xmm1, xmm1
test r8b, 1
jne .LBB0_7
jmp .LBB0_8
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment