SeijiEmery · May 13, 2017 17:14
diff --git a/tetris_transform_point_algorithm.asm b/tetris_transform_point_algorithm.asm


 ; Transform an x/y point stored in rax w/ a rotation index (0-3) and origin x/y point in rdx.
 proc transform_point (in rax pt, in rdx p0, in rcx rot_index) => (out rax pt)
 section .data
    ; Rotation table: transpose mask + sign-flip mask for x coord.
    sign_mask: dd 0x0, 0x0, 0x7000, 0x7000
    ; tr_masks:  dq 0x0000ffff, 0xffff0000 ;, 0x0000ffff, 0xffff0000
    tr_masks:  dq 0x00000000, 0xffffffff

 section .text
    ; Save rbx
    push rbx

    ; Clamp rotation index to [0, 3]
    and rcx, 3

    ; maybe flip x sign
    xor  eax, [sign_mask + rcx * 4]

    ; maybe transpose x / y
    
    ; let rbx = rax transposed (flipped x / y)
    push rbx            ; save rbx
    mov rbx, rax
    shl rbx, 32
    or  ebx, eax

 ; TO COMBINE, EITHER: (dunno which; use one or the other)
 ; USE cmov branch (branch may be costly)
    and rcx, 1
    cmovnz rax, rbx

 ; OR use memory + masks (more instructions)
    ; mask rax by tr_masks[ecx & 1]
    and rcx, 1
    and rax, [tr_masks + rcx * 8]

    ; mask rbx by tr_masks[(ecx+1) & 1]
    inc rcx
    and rcx, 1
    and rbx, [tr_masks + rcx * 8]

    ; combine rax, rbx
    or  rax, rbx
 ; END
    ; add p0 to pt
    add eax, edx            ; add x component
    and rdx, 0xffff0000     ; mask y only in rdx
    add rdx, rax            ; add y in rdx
    add rax, rdx            ; add back

    ; restore rbx; clear rdx / rcx
    pop rbx
    xor rdx, rdx            
    xor rcx, rcx

    ret


 ;
 ; OPTIMIZATION 1:
 ;

 ; Transform an x/y point stored in rax w/ a rotation index (0-3) and 
 proc transform_point (in rax pt, in rdx p0, in rcx rot_index) => (out rax pt)
 section .data
    ; Rotation table: transpose mask + sign-flip mask for x coord.
    sign_mask: dd 0x0, 0x0, 0x7000, 0x7000
    ; tr_masks:  dq 0x0000ffff, 0xffff0000 ;, 0x0000ffff, 0xffff0000
    tr_masks:  dq 0x00000000, 0xffffffff

 section .text
    ; Save rbx
    push rbx

    ; push sign + transform masks
    mov [rsp],      qword 0xffffffff    ; utility mask: [rsp+0]: all bits set   (64 bits)
    mov [rsp+8],    qword 0x00000000    ;               [rsp+8]: all bits clear (64 bits)
    mov [rsp+16],   dword 0x7000        ; overlapping sign mask:
                                        ;   [rsp+12]: clear mask (32 bits)
                                        ;   [rsp+16]: sign bit   (32 bits)

    ; flip x sign if rotation_index & 2 != 0
    mov rbx, rcx
    and rbx, 2
    xor eax, dword [rsp+12 + rbx * 4]

    ; apply x / y transpose if rotation_index & 1 != 0
    
    ; let rbx = rax transposed (flipped x / y)
    push rbx
    mov rbx, rax
    shl rbx, 32
    or  ebx, eax

 ; TO COMBINE, EITHER: (dunno which; use one or the other)
 ; USE cmov branch (branch may be costly)
    and rcx, 1
    cmovnz rax, rbx

 ; OR use memory + masks (more instructions)
    ; mask rax by tr_masks[ecx & 1]
    and rcx, 1
    and rax, qword [rsp + rcx * 8]

    ; mask rbx by tr_masks[(ecx+1) & 1]
    inc rcx
    and rcx, 1
    and rbx, qword [rsp + rcx * 8]

    ; combine rax, rbx
    or  rax, rbx
 ; END
    ; add p0 to pt
    ; Note: this is more complicated than just 'add rax, rdx', which would *mostly* work.
    ; The one problem is the edge case where high x / y values 

    add eax, edx            ; add x component

    ; To add y component:
    and rdx, 0xffff0000     ; mask to y only; prevents potential overflow from bits [0-32)
    add rdx, rax            ; add y component in rdx
    and rdx, 0xffff0000     ; mask to remove x component in rdx, and y component in eax
    and rax, 0xffff
    add rax, rdx            ; merge components; safe, since x / y values now orthogonal




    and rdx, 0xffff0000     ; mask y only in rdx
    add rdx, rax            ; add y components into rdx; rdx = y only
    and rdx, 0xffff0000     ; mask to ensure y only
    and rax, 0xffff         ; mask rax = x only
    add rax, rdx            ; merge x / y components.

    ; restore rbx; clear rdx / rcx
    pop rbx
    xor rdx, rdx            
    xor rcx, rcx

    ret

 ;
 ; OPTIMIZATION 2: refactor to an array processing algorithm, where we now process points in bulk.
 ;

 proc transform_points (
    in  ptr rsi 32_bit_xy_input_point_array,
    out ptr rdi 32_bit_xy_output_point_array,
    const rcx array_size,
    const rdx p0,
    const rax rotation_index
 ) {
    ; save const registers + rbx
    mov [rsp],      rcx     ; initial array size
    mov [rsp+8],    rdx     ; origin point (p0)
    mov [rsp+16],   rax     ; rotation index ([0,3]; can be any value but we only use the lower bits)
    mov [rsp+24],   rbx     ; saved value for rbx

    ; store a dword sign mask at [rsp+32]: mask = (rax & 2 != 0) ? 0x7000 : 0x0
    ; this mask is used to selectively flip the sign of the x component; is pre-selected based on rotation
    and rax, 2
    mov [rsp+32], qword 0x7000
    mov rax, dword [rsp+32 + rax * 2]     ; rax = 0 | 2 => rax * 2 = 0 | 4
    mov [rsp+32], rax                     ; => [rsp + 32] (0x0000) iff rax & 2 != 0, else [rsp + 36] (0x7000)

    ; store a our transpose selector at dword [rsp+36]. This controls whether we transpose our x / y values
    ; or not, and has the following values:
    ;   rotation_index & 1 == 0  =>  0
    ;   rotation_index & 1 != 0  =>  4
    mov rax, [rsp+16]
    and rax, 1
    shl rax, 2
    mov [rsp+36], eax

 .process_points:
    sub rcx, 1
    jl  .end

    ; load point from array
    mov rax, [rsi + rcx * 8]

    ; Rotate our point in two steps; accomplished via selective transpose + sign flips:
    ; rotation value:   0       1       2       3
    ;   transpose?      no      yes     no      yes
    ;   flip sign?      no      no      yes     yes
    ; This produces correct results for simple 90-degree rotations (where degrees = (rotation_index & 3) * 90)
    ;
    ; As you can see, transpose == (rotation & 1 != 0), sign_flip == (rotation & 2 != 0),
    ; which should hopefully explain our logic above.
    
    ; selectively flip sign bit of our x component (lower 32 bits) using the sign mask
    xor eax, [rsp+32]

    ; Store this point, and its transpose, on the stack:
    ;   dword [rsp+40]: xxxx
    ;   dword [rsp+44]: yyyy
    ;   dword [rsp+48]: xxxx
    ;
    ;   qword [rsp+40] == (x, y),
    ;   qword [rsp+44] == (y, x)
    ;
    mov [rsp+40], rax
    mov [rsp+48], rax
    xor rax, rax

    ; Select using our transpose selector
    mov eax, [rsp+36]           ; load selector
    mov rax, [rsp+40 + eax]     ; load value | transposed value

    ; Finish transform by adding our origin point, and store result in output array

    ; Add + store x component
    add eax, [rsp+8]
    mov [rdi + rcx * 8], eax

    ; Add + store y component
    shr rax, 32
    add eax, [rsp+12]
    mov [rdi + rcx * 8 + 4], eax

    ; Repeat for all array elements
    jmp .process_points
 .end:
    ; restore const registers
    mov rcx, [rsp]
    mov rdx, [rsp+8]
    mov rax, [rsp+16]

    ret
 }

 ;
 ; Optimization 3: reduce memory ops by storing masks in unused registers (we were only using 4!)
 ; Still plenty of stack memory use b/c it's efficient (L1), branchless, and makes transposing points much easier.
 ;

 ; Transforms an array of points, applying a simple 90-degree rotation + offset from a fixed point
 ;   Inputs are 32-bit signed integers in a packed array as (x, y) values. Values are in LOCAL space.
 ;   Offset is a 32-bit (x, y) pair in WORLD space, packed into a 64-bit register.
 ;   All addressing is little-endian.
 ;
 ;   Rotation is an integer on [0,4) representing 90-degree increments; values wrap.
 ;   degrees = 90 * rotation (0 => 0 degrees, 1 => 90 degrees, 2 => 180 degrees, etc.)
 ;
 proc transform_points (
    in  ptr rsi 32_bit_xy_input_point_array,
    out ptr rdi 32_bit_xy_output_point_array,
    const rcx array_size,
    const rdx p0,
    const rax rotation_index
 ) {
    ; save const registers + rbx
    mov [rsp],      rcx     ; initial array size
    mov [rsp+8],    rdx     ; origin point (p0)
    mov [rsp+16],   rax     ; rotation index ([0,3]; can be any value but we only use the lower bits)
    mov [rsp+24],   rbx     ; saved value for rbx

    ;
    ; Pre calculations:
    ;

    ; store a dword sign mask in rdx: mask = (rax & 2 != 0) ? 0x7000 : 0x0
    ; this mask is used to selectively flip the sign of the x component; is pre-selected based on rotation
    and rax, 2
    mov [rsp+32], qword 0x7000
    mov rdx, dword [rsp+32 + rax * 2]     ; rax = 0 | 2 => rax * 2 = 0 | 4
                                          ; => rdx = (0x0000) iff rax & 2 != 0, else rdx = (0x7000)

    ; store a our transpose selector in rbx. This controls whether we transpose our x / y values
    ; or not, and has the following values:
    ;   rotation_index & 1 == 0  =>  0  => [rsp+32] (see below)
    ;   rotation_index & 1 != 0  =>  4  => [rsp+36]
    mov rbx, [rsp+16]
    and rbx, 1          ; rbx = 0 | 1
    shl rbx, 2          ; rbx = 0 | 4
    add rbx, 32         ; rbx = 32 | 36   (manual memory addressing; see below)

    ;
    ; Processing loop
    ;
 .process_points:
    sub rcx, 1
    jl  .end

    ; load point from array
    mov rax, [rsi + rcx * 8]

    ; Rotate our point in two steps; accomplished via selective transpose + sign flips:
    ; rotation value:   0       1       2       3
    ;   transpose?      no      yes     no      yes
    ;   flip sign?      no      no      yes     yes
    ; This produces correct results for simple 90-degree rotations (where degrees = (rotation_index & 3) * 90)
    ;
    ; As you can see, transpose == (rotation & 1 != 0), sign_flip == (rotation & 2 != 0),
    ; which should hopefully explain our logic above.
    
    ; selectively flip sign bit of our x component (lower 32 bits) using the sign mask
    xor eax, rdx

    ; Store this point, and its transpose, on the stack:
    ;   dword [rsp+32]: xxxx
    ;   dword [rsp+36]: yyyy
    ;   dword [rsp+40]: xxxx
    ;
    ;   qword [rsp+32] == (x, y),
    ;   qword [rsp+36] == (y, x)
    ;
    mov [rsp+32], rax
    mov [rsp+40], rax
    xor rax, rax

    ; Select using our transpose selector
    mov rax, [rsp + rbx]

    ; Finish transform by adding our origin point, and store result in output array

    ; Add + store x component
    add eax, [rsp+8]
    mov [rdi + rcx * 8], eax

    ; Add + store y component
    shr rax, 32
    add eax, [rsp+12]
    mov [rdi + rcx * 8 + 4], eax

    ; Repeat for all array elements
    jmp .process_points
 .end:
    ; restore const registers
    mov rcx, [rsp]
    mov rdx, [rsp+8]
    mov rax, [rsp+16]
    mov rbx, [rsp+24]

    ret
 }


	; Transform an x/y point stored in rax w/ a rotation index (0-3) and origin x/y point in rdx.
	proc transform_point (in rax pt, in rdx p0, in rcx rot_index) => (out rax pt)
	section .data
	; Rotation table: transpose mask + sign-flip mask for x coord.
	sign_mask: dd 0x0, 0x0, 0x7000, 0x7000
	; tr_masks: dq 0x0000ffff, 0xffff0000 ;, 0x0000ffff, 0xffff0000
	tr_masks: dq 0x00000000, 0xffffffff

	section .text
	; Save rbx
	push rbx

	; Clamp rotation index to [0, 3]
	and rcx, 3

	; maybe flip x sign
	xor eax, [sign_mask + rcx * 4]

	; maybe transpose x / y

	; let rbx = rax transposed (flipped x / y)
	push rbx ; save rbx
	mov rbx, rax
	shl rbx, 32
	or ebx, eax

	; TO COMBINE, EITHER: (dunno which; use one or the other)
	; USE cmov branch (branch may be costly)
	and rcx, 1
	cmovnz rax, rbx

	; OR use memory + masks (more instructions)
	; mask rax by tr_masks[ecx & 1]
	and rcx, 1
	and rax, [tr_masks + rcx * 8]

	; mask rbx by tr_masks[(ecx+1) & 1]
	inc rcx
	and rcx, 1
	and rbx, [tr_masks + rcx * 8]

	; combine rax, rbx
	or rax, rbx
	; END
	; add p0 to pt
	add eax, edx ; add x component
	and rdx, 0xffff0000 ; mask y only in rdx
	add rdx, rax ; add y in rdx
	add rax, rdx ; add back

	; restore rbx; clear rdx / rcx
	pop rbx
	xor rdx, rdx
	xor rcx, rcx

	ret


	;
	; OPTIMIZATION 1:
	;

	; Transform an x/y point stored in rax w/ a rotation index (0-3) and
	proc transform_point (in rax pt, in rdx p0, in rcx rot_index) => (out rax pt)
	section .data
	; Rotation table: transpose mask + sign-flip mask for x coord.
	sign_mask: dd 0x0, 0x0, 0x7000, 0x7000
	; tr_masks: dq 0x0000ffff, 0xffff0000 ;, 0x0000ffff, 0xffff0000
	tr_masks: dq 0x00000000, 0xffffffff

	section .text
	; Save rbx
	push rbx

	; push sign + transform masks
	mov [rsp], qword 0xffffffff ; utility mask: [rsp+0]: all bits set (64 bits)
	mov [rsp+8], qword 0x00000000 ; [rsp+8]: all bits clear (64 bits)
	mov [rsp+16], dword 0x7000 ; overlapping sign mask:
	; [rsp+12]: clear mask (32 bits)
	; [rsp+16]: sign bit (32 bits)

	; flip x sign if rotation_index & 2 != 0
	mov rbx, rcx
	and rbx, 2
	xor eax, dword [rsp+12 + rbx * 4]

	; apply x / y transpose if rotation_index & 1 != 0

	; let rbx = rax transposed (flipped x / y)
	push rbx
	mov rbx, rax
	shl rbx, 32
	or ebx, eax

	; TO COMBINE, EITHER: (dunno which; use one or the other)
	; USE cmov branch (branch may be costly)
	and rcx, 1
	cmovnz rax, rbx

	; OR use memory + masks (more instructions)
	; mask rax by tr_masks[ecx & 1]
	and rcx, 1
	and rax, qword [rsp + rcx * 8]

	; mask rbx by tr_masks[(ecx+1) & 1]
	inc rcx
	and rcx, 1
	and rbx, qword [rsp + rcx * 8]

	; combine rax, rbx
	or rax, rbx
	; END
	; add p0 to pt
	; Note: this is more complicated than just 'add rax, rdx', which would mostly work.
	; The one problem is the edge case where high x / y values

	add eax, edx ; add x component

	; To add y component:
	and rdx, 0xffff0000 ; mask to y only; prevents potential overflow from bits [0-32)
	add rdx, rax ; add y component in rdx
	and rdx, 0xffff0000 ; mask to remove x component in rdx, and y component in eax
	and rax, 0xffff
	add rax, rdx ; merge components; safe, since x / y values now orthogonal




	and rdx, 0xffff0000 ; mask y only in rdx
	add rdx, rax ; add y components into rdx; rdx = y only
	and rdx, 0xffff0000 ; mask to ensure y only
	and rax, 0xffff ; mask rax = x only
	add rax, rdx ; merge x / y components.

	; restore rbx; clear rdx / rcx
	pop rbx
	xor rdx, rdx
	xor rcx, rcx

	ret

	;
	; OPTIMIZATION 2: refactor to an array processing algorithm, where we now process points in bulk.
	;

	proc transform_points (
	in ptr rsi 32_bit_xy_input_point_array,
	out ptr rdi 32_bit_xy_output_point_array,
	const rcx array_size,
	const rdx p0,
	const rax rotation_index
	) {
	; save const registers + rbx
	mov [rsp], rcx ; initial array size
	mov [rsp+8], rdx ; origin point (p0)
	mov [rsp+16], rax ; rotation index ([0,3]; can be any value but we only use the lower bits)
	mov [rsp+24], rbx ; saved value for rbx

	; store a dword sign mask at [rsp+32]: mask = (rax & 2 != 0) ? 0x7000 : 0x0
	; this mask is used to selectively flip the sign of the x component; is pre-selected based on rotation
	and rax, 2
	mov [rsp+32], qword 0x7000
	mov rax, dword [rsp+32 + rax * 2] ; rax = 0 \| 2 => rax * 2 = 0 \| 4
	mov [rsp+32], rax ; => [rsp + 32] (0x0000) iff rax & 2 != 0, else [rsp + 36] (0x7000)

	; store a our transpose selector at dword [rsp+36]. This controls whether we transpose our x / y values
	; or not, and has the following values:
	; rotation_index & 1 == 0 => 0
	; rotation_index & 1 != 0 => 4
	mov rax, [rsp+16]
	and rax, 1
	shl rax, 2
	mov [rsp+36], eax

	.process_points:
	sub rcx, 1
	jl .end

	; load point from array
	mov rax, [rsi + rcx * 8]

	; Rotate our point in two steps; accomplished via selective transpose + sign flips:
	; rotation value: 0 1 2 3
	; transpose? no yes no yes
	; flip sign? no no yes yes
	; This produces correct results for simple 90-degree rotations (where degrees = (rotation_index & 3) * 90)
	;
	; As you can see, transpose == (rotation & 1 != 0), sign_flip == (rotation & 2 != 0),
	; which should hopefully explain our logic above.

	; selectively flip sign bit of our x component (lower 32 bits) using the sign mask
	xor eax, [rsp+32]

	; Store this point, and its transpose, on the stack:
	; dword [rsp+40]: xxxx
	; dword [rsp+44]: yyyy
	; dword [rsp+48]: xxxx
	;
	; qword [rsp+40] == (x, y),
	; qword [rsp+44] == (y, x)
	;
	mov [rsp+40], rax
	mov [rsp+48], rax
	xor rax, rax

	; Select using our transpose selector
	mov eax, [rsp+36] ; load selector
	mov rax, [rsp+40 + eax] ; load value \| transposed value

	; Finish transform by adding our origin point, and store result in output array

	; Add + store x component
	add eax, [rsp+8]
	mov [rdi + rcx * 8], eax

	; Add + store y component
	shr rax, 32
	add eax, [rsp+12]
	mov [rdi + rcx * 8 + 4], eax

	; Repeat for all array elements
	jmp .process_points
	.end:
	; restore const registers
	mov rcx, [rsp]
	mov rdx, [rsp+8]
	mov rax, [rsp+16]

	ret
	}

	;
	; Optimization 3: reduce memory ops by storing masks in unused registers (we were only using 4!)
	; Still plenty of stack memory use b/c it's efficient (L1), branchless, and makes transposing points much easier.
	;

	; Transforms an array of points, applying a simple 90-degree rotation + offset from a fixed point
	; Inputs are 32-bit signed integers in a packed array as (x, y) values. Values are in LOCAL space.
	; Offset is a 32-bit (x, y) pair in WORLD space, packed into a 64-bit register.
	; All addressing is little-endian.
	;
	; Rotation is an integer on [0,4) representing 90-degree increments; values wrap.
	; degrees = 90 * rotation (0 => 0 degrees, 1 => 90 degrees, 2 => 180 degrees, etc.)
	;
	proc transform_points (
	in ptr rsi 32_bit_xy_input_point_array,
	out ptr rdi 32_bit_xy_output_point_array,
	const rcx array_size,
	const rdx p0,
	const rax rotation_index
	) {
	; save const registers + rbx
	mov [rsp], rcx ; initial array size
	mov [rsp+8], rdx ; origin point (p0)
	mov [rsp+16], rax ; rotation index ([0,3]; can be any value but we only use the lower bits)
	mov [rsp+24], rbx ; saved value for rbx

	;
	; Pre calculations:
	;

	; store a dword sign mask in rdx: mask = (rax & 2 != 0) ? 0x7000 : 0x0
	; this mask is used to selectively flip the sign of the x component; is pre-selected based on rotation
	and rax, 2
	mov [rsp+32], qword 0x7000
	mov rdx, dword [rsp+32 + rax * 2] ; rax = 0 \| 2 => rax * 2 = 0 \| 4
	; => rdx = (0x0000) iff rax & 2 != 0, else rdx = (0x7000)

	; store a our transpose selector in rbx. This controls whether we transpose our x / y values
	; or not, and has the following values:
	; rotation_index & 1 == 0 => 0 => [rsp+32] (see below)
	; rotation_index & 1 != 0 => 4 => [rsp+36]
	mov rbx, [rsp+16]
	and rbx, 1 ; rbx = 0 \| 1
	shl rbx, 2 ; rbx = 0 \| 4
	add rbx, 32 ; rbx = 32 \| 36 (manual memory addressing; see below)

	;
	; Processing loop
	;
	.process_points:
	sub rcx, 1
	jl .end

	; load point from array
	mov rax, [rsi + rcx * 8]

	; Rotate our point in two steps; accomplished via selective transpose + sign flips:
	; rotation value: 0 1 2 3
	; transpose? no yes no yes
	; flip sign? no no yes yes
	; This produces correct results for simple 90-degree rotations (where degrees = (rotation_index & 3) * 90)
	;
	; As you can see, transpose == (rotation & 1 != 0), sign_flip == (rotation & 2 != 0),
	; which should hopefully explain our logic above.

	; selectively flip sign bit of our x component (lower 32 bits) using the sign mask
	xor eax, rdx

	; Store this point, and its transpose, on the stack:
	; dword [rsp+32]: xxxx
	; dword [rsp+36]: yyyy
	; dword [rsp+40]: xxxx
	;
	; qword [rsp+32] == (x, y),
	; qword [rsp+36] == (y, x)
	;
	mov [rsp+32], rax
	mov [rsp+40], rax
	xor rax, rax

	; Select using our transpose selector
	mov rax, [rsp + rbx]

	; Finish transform by adding our origin point, and store result in output array

	; Add + store x component
	add eax, [rsp+8]
	mov [rdi + rcx * 8], eax

	; Add + store y component
	shr rax, 32
	add eax, [rsp+12]
	mov [rdi + rcx * 8 + 4], eax

	; Repeat for all array elements
	jmp .process_points
	.end:
	; restore const registers
	mov rcx, [rsp]
	mov rdx, [rsp+8]
	mov rax, [rsp+16]
	mov rbx, [rsp+24]

	ret
	}