Skip to content

Instantly share code, notes, and snippets.

@3outeille
Last active September 6, 2022 20:39
Show Gist options
  • Save 3outeille/6259ec56963347e0f4696110e04f0549 to your computer and use it in GitHub Desktop.
Save 3outeille/6259ec56963347e0f4696110e04f0549 to your computer and use it in GitHub Desktop.
x86inc fg blend stripe sse4
%define ARCH_X86_64 1
%define private_prefix asm
%include "x86inc.asm"
; ---------------------------------------------------------------------------
; Static data. Every object below is one 16-byte vector made of four 32-bit
; lanes and is placed in a 16-byte aligned section.
; ---------------------------------------------------------------------------

; Read-only constants: all lanes 0, all lanes 1, all lanes -1.
section .rodata align=16
zeros_vec:      times 4 dd 0
ones_vec:       times 4 dd 1
neg_ones_vec:   times 4 dd -1

; Writable vector, statically initialised to 1 in every lane.
section .data align=16
mask:           times 4 dd 1

; Uninitialised writable vector: 16 bytes = 4 dwords.
section .bss align=16
not_mask:       resb 16
section .text
INIT_XMM sse4

;-----------------------------------------------------------------------------
; fg_blend_stripe(dstSampleOffsetY, srcSampleOffsetY, grainStripe,
;                 widthComp, blockHeight, bitDepth)
;
; For every sample i of a blockHeight x widthComp block stored row-major with
; a row stride of widthComp samples:
;
;     v      = srcSampleOffsetY[i] + (grainStripe[i] << (bitDepth - 8))
;     dst[i] = clamp(v, 0, (1 << bitDepth) - 1)
;
; In (x86inc argument registers):
;     dstSampleOffsetY  pointer to 16-bit destination samples
;     srcSampleOffsetY  pointer to 16-bit source samples (sign-extended on load)
;     grainStripe       pointer to 32-bit grain values
;     widthComp         samples per row
;     blockHeight       number of rows
;     bitDepth          sample bit depth; only the low byte is read
; Out:   none (results are written through dstSampleOffsetY)
; Clobb: row, col, pos (scratch GPRs reserved via cglobal), m0-m6, flags
; Stack: no stack locals and no static scratch storage, so the routine is
;        reentrant.
;
; Preconditions / notes:
;   * 8 <= bitDepth <= 16 is required for a meaningful result: the clamped
;     value has to fit the 16-bit destination for packusdw to be exact.
;   * Samples are handled 8 at a time. When widthComp is not a multiple of 8
;     the last step of each row touches up to 7 samples past the row end.
;     NOTE(review): presumably callers guarantee widthComp % 8 == 0 or padded
;     buffers - confirm.
;   * widthComp and blockHeight are compared as signed 64-bit registers.
;     NOTE(review): the C prototype must pass them as 64-bit integers (or the
;     caller must extend them) - confirm against the declaration.
;   * Loads and stores use unaligned moves, so no buffer alignment is needed.
;-----------------------------------------------------------------------------
cglobal fg_blend_stripe, 6, 9, 7, dstSampleOffsetY, srcSampleOffsetY, grainStripe, widthComp, blockHeight, bitDepth, row, col, pos

        ; ---- loop-invariant vectors --------------------------------------
        ; m4 = shift count (bitDepth - 8)
        ; m5 = 0 in every lane                  (lower clamp bound)
        ; m6 = (1 << bitDepth) - 1 in each lane (upper clamp bound)
        movzx       posd, bitDepthb             ; clean 32-bit copy of bitDepth
        movd        m6, posd                    ; shift count for the mask
        sub         posd, 8
        movd        m4, posd                    ; shift count for the grain
        pcmpeqd     m5, m5                      ; all ones
        pslld       m5, m6                      ; -1 << bitDepth  (= ~mask)
        pcmpeqd     m6, m6
        pxor        m6, m5                      ; mask = ~(-1 << bitDepth)
        pxor        m5, m5                      ; zero vector

        xor         rowd, rowd                  ; row = 0
.row_loop:
        cmp         rowq, blockHeightq          ; while (row < blockHeight)
        jge         .done

        ; pos = byte offset of the row start inside the 16-bit arrays.
        ; The 32-bit grain array uses pos * 2.
        mov         posq, rowq
        imul        posq, widthCompq
        add         posq, posq

        xor         cold, cold                  ; col = 0
.col_loop:
        cmp         colq, widthCompq            ; while (col < widthComp)
        jge         .next_row

        ; grain << (bitDepth - 8), 8 dwords
        movu        m0, [grainStripeq + posq * 2]
        movu        m1, [grainStripeq + posq * 2 + 16]
        pslld       m0, m4
        pslld       m1, m4

        ; + source samples, sign-extended from 16 to 32 bits
        pmovsxwd    m2, [srcSampleOffsetYq + posq]
        pmovsxwd    m3, [srcSampleOffsetYq + posq + 8]
        paddd       m0, m2
        paddd       m1, m3

        ; clamp to [0, (1 << bitDepth) - 1]
        pmaxsd      m0, m5
        pmaxsd      m1, m5
        pminsd      m0, m6
        pminsd      m1, m6

        ; Narrow 8 dwords to 8 words and store them with one write. The values
        ; are already within [0, 65535] for bitDepth <= 16, so the unsigned
        ; saturation of packusdw never alters them.
        packusdw    m0, m1
        movu        [dstSampleOffsetYq + posq], m0

        add         posq, 16                    ; 8 samples * 2 bytes
        add         colq, 8
        jmp         .col_loop

.next_row:
        inc         rowq                        ; every row is processed
        jmp         .row_loop

.done:
        RET
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment