Last active
May 13, 2017 17:14
-
-
Save SeijiEmery/c8d98d9f3a3030da016ad1554802ae8a to your computer and use it in GitHub Desktop.
Simple algorithm for translating a point in assembly for a tetris clone / any 90-degree-rotation based game. Not tested, but has several optimization passes (should work). This is what happens when I get bored, lol.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ; Transform an x/y point stored in rax w/ a rotation index (0-3) and origin x/y point in rdx. | |
| proc transform_point (in rax pt, in rdx p0, in rcx rot_index) => (out rax pt) | |
| section .data | |
| ; Rotation table: transpose mask + sign-flip mask for x coord. | |
| sign_mask: dd 0x0, 0x0, 0x7000, 0x7000 | |
| ; tr_masks: dq 0x0000ffff, 0xffff0000 ;, 0x0000ffff, 0xffff0000 | |
| tr_masks: dq 0x00000000, 0xffffffff | |
| section .text | |
| ; Save rbx | |
| push rbx | |
| ; Clamp rotation index to [0, 3] | |
| and rcx, 3 | |
| ; maybe flip x sign | |
| xor eax, [sign_mask + rcx * 4] | |
| ; maybe transpose x / y | |
| ; let rbx = rax transposed (flipped x / y) | |
| push rbx ; save rbx | |
| mov rbx, rax | |
| shl rbx, 32 | |
| or ebx, eax | |
| ; TO COMBINE, EITHER: (dunno which; use one or the other) | |
| ; USE cmov branch (branch may be costly) | |
| and rcx, 1 | |
| cmovnz rax, rbx | |
| ; OR use memory + masks (more instructions) | |
| ; mask rax by tr_masks[ecx & 1] | |
| and rcx, 1 | |
| and rax, [tr_masks + rcx * 8] | |
| ; mask rbx by tr_masks[(ecx+1) & 1] | |
| inc rcx | |
| and rcx, 1 | |
| and rbx, [tr_masks + rcx * 8] | |
| ; combine rax, rbx | |
| or rax, rbx | |
| ; END | |
| ; add p0 to pt | |
| add eax, edx ; add x component | |
| and rdx, 0xffff0000 ; mask y only in rdx | |
| add rdx, rax ; add y in rdx | |
| add rax, rdx ; add back | |
| ; restore rbx; clear rdx / rcx | |
| pop rbx | |
| xor rdx, rdx | |
| xor rcx, rcx | |
| ret | |
| ; | |
| ; OPTIMIZATION 1: | |
| ; | |
| ; Transform an x/y point stored in rax w/ a rotation index (0-3) and | |
| proc transform_point (in rax pt, in rdx p0, in rcx rot_index) => (out rax pt) | |
| section .data | |
| ; Rotation table: transpose mask + sign-flip mask for x coord. | |
| sign_mask: dd 0x0, 0x0, 0x7000, 0x7000 | |
| ; tr_masks: dq 0x0000ffff, 0xffff0000 ;, 0x0000ffff, 0xffff0000 | |
| tr_masks: dq 0x00000000, 0xffffffff | |
| section .text | |
| ; Save rbx | |
| push rbx | |
| ; push sign + transform masks | |
| mov [rsp], qword 0xffffffff ; utility mask: [rsp+0]: all bits set (64 bits) | |
| mov [rsp+8], qword 0x00000000 ; [rsp+8]: all bits clear (64 bits) | |
| mov [rsp+16], dword 0x7000 ; overlapping sign mask: | |
| ; [rsp+12]: clear mask (32 bits) | |
| ; [rsp+16]: sign bit (32 bits) | |
| ; flip x sign if rotation_index & 2 != 0 | |
| mov rbx, rcx | |
| and rbx, 2 | |
| xor eax, dword [rsp+12 + rbx * 4] | |
| ; apply x / y transpose if rotation_index & 1 != 0 | |
| ; let rbx = rax transposed (flipped x / y) | |
| push rbx | |
| mov rbx, rax | |
| shl rbx, 32 | |
| or ebx, eax | |
| ; TO COMBINE, EITHER: (dunno which; use one or the other) | |
| ; USE cmov branch (branch may be costly) | |
| and rcx, 1 | |
| cmovnz rax, rbx | |
| ; OR use memory + masks (more instructions) | |
| ; mask rax by tr_masks[ecx & 1] | |
| and rcx, 1 | |
| and rax, qword [rsp + rcx * 8] | |
| ; mask rbx by tr_masks[(ecx+1) & 1] | |
| inc rcx | |
| and rcx, 1 | |
| and rbx, qword [rsp + rcx * 8] | |
| ; combine rax, rbx | |
| or rax, rbx | |
| ; END | |
| ; add p0 to pt | |
| ; Note: this is more complicated than just 'add rax, rdx', which would *mostly* work. | |
| ; The one problem is the edge case where high x / y values | |
| add eax, edx ; add x component | |
| ; To add y component: | |
| and rdx, 0xffff0000 ; mask to y only; prevents potential overflow from bits [0-32) | |
| add rdx, rax ; add y component in rdx | |
| and rdx, 0xffff0000 ; mask to remove x component in rdx, and y component in eax | |
| and rax, 0xffff | |
| add rax, rdx ; merge components; safe, since x / y values now orthogonal | |
| and rdx, 0xffff0000 ; mask y only in rdx | |
| add rdx, rax ; add y components into rdx; rdx = y only | |
| and rdx, 0xffff0000 ; mask to ensure y only | |
| and rax, 0xffff ; mask rax = x only | |
| add rax, rdx ; merge x / y components. | |
| ; restore rbx; clear rdx / rcx | |
| pop rbx | |
| xor rdx, rdx | |
| xor rcx, rcx | |
| ret | |
| ; | |
| ; OPTIMIZATION 2: refactor to an array processing algorithm, where we now process points in bulk. | |
| ; | |
| proc transform_points ( | |
| in ptr rsi 32_bit_xy_input_point_array, | |
| out ptr rdi 32_bit_xy_output_point_array, | |
| const rcx array_size, | |
| const rdx p0, | |
| const rax rotation_index | |
| ) { | |
| ; save const registers + rbx | |
| mov [rsp], rcx ; initial array size | |
| mov [rsp+8], rdx ; origin point (p0) | |
| mov [rsp+16], rax ; rotation index ([0,3]; can be any value but we only use the lower bits) | |
| mov [rsp+24], rbx ; saved value for rbx | |
| ; store a dword sign mask at [rsp+32]: mask = (rax & 2 != 0) ? 0x7000 : 0x0 | |
| ; this mask is used to selectively flip the sign of the x component; is pre-selected based on rotation | |
| and rax, 2 | |
| mov [rsp+32], qword 0x7000 | |
| mov rax, dword [rsp+32 + rax * 2] ; rax = 0 | 2 => rax * 2 = 0 | 4 | |
| mov [rsp+32], rax ; => [rsp + 32] (0x0000) iff rax & 2 != 0, else [rsp + 36] (0x7000) | |
| ; store a our transpose selector at dword [rsp+36]. This controls whether we transpose our x / y values | |
| ; or not, and has the following values: | |
| ; rotation_index & 1 == 0 => 0 | |
| ; rotation_index & 1 != 0 => 4 | |
| mov rax, [rsp+16] | |
| and rax, 1 | |
| shl rax, 2 | |
| mov [rsp+36], eax | |
| .process_points: | |
| sub rcx, 1 | |
| jl .end | |
| ; load point from array | |
| mov rax, [rsi + rcx * 8] | |
| ; Rotate our point in two steps; accomplished via selective transpose + sign flips: | |
| ; rotation value: 0 1 2 3 | |
| ; transpose? no yes no yes | |
| ; flip sign? no no yes yes | |
| ; This produces correct results for simple 90-degree rotations (where degrees = (rotation_index & 3) * 90) | |
| ; | |
| ; As you can see, transpose == (rotation & 1 != 0), sign_flip == (rotation & 2 != 0), | |
| ; which should hopefully explain our logic above. | |
| ; selectively flip sign bit of our x component (lower 32 bits) using the sign mask | |
| xor eax, [rsp+32] | |
| ; Store this point, and its transpose, on the stack: | |
| ; dword [rsp+40]: xxxx | |
| ; dword [rsp+44]: yyyy | |
| ; dword [rsp+48]: xxxx | |
| ; | |
| ; qword [rsp+40] == (x, y), | |
| ; qword [rsp+44] == (y, x) | |
| ; | |
| mov [rsp+40], rax | |
| mov [rsp+48], rax | |
| xor rax, rax | |
| ; Select using our transpose selector | |
| mov eax, [rsp+36] ; load selector | |
| mov rax, [rsp+40 + eax] ; load value | transposed value | |
| ; Finish transform by adding our origin point, and store result in output array | |
| ; Add + store x component | |
| add eax, [rsp+8] | |
| mov [rdi + rcx * 8], eax | |
| ; Add + store y component | |
| shr rax, 32 | |
| add eax, [rsp+12] | |
| mov [rdi + rcx * 8 + 4], eax | |
| ; Repeat for all array elements | |
| jmp .process_points | |
| .end: | |
| ; restore const registers | |
| mov rcx, [rsp] | |
| mov rdx, [rsp+8] | |
| mov rax, [rsp+16] | |
| ret | |
| } | |
| ; | |
| ; Optimization 3: reduce memory ops by storing masks in unused registers (we were only using 4!) | |
| ; Still plenty of stack memory use b/c it's efficient (L1), branchless, and makes transposing points much easier. | |
| ; | |
| ; Transforms an array of points, applying a simple 90-degree rotation + offset from a fixed point | |
| ; Inputs are 32-bit signed integers in a packed array as (x, y) values. Values are in LOCAL space. | |
| ; Offset is a 32-bit (x, y) pair in WORLD space, packed into a 64-bit register. | |
| ; All addressing is little-endian. | |
| ; | |
| ; Rotation is an integer on [0,4) representing 90-degree increments; values wrap. | |
| ; degrees = 90 * rotation (0 => 0 degrees, 1 => 90 degrees, 2 => 180 degrees, etc.) | |
| ; | |
| proc transform_points ( | |
| in ptr rsi 32_bit_xy_input_point_array, | |
| out ptr rdi 32_bit_xy_output_point_array, | |
| const rcx array_size, | |
| const rdx p0, | |
| const rax rotation_index | |
| ) { | |
| ; save const registers + rbx | |
| mov [rsp], rcx ; initial array size | |
| mov [rsp+8], rdx ; origin point (p0) | |
| mov [rsp+16], rax ; rotation index ([0,3]; can be any value but we only use the lower bits) | |
| mov [rsp+24], rbx ; saved value for rbx | |
| ; | |
| ; Pre calculations: | |
| ; | |
| ; store a dword sign mask in rdx: mask = (rax & 2 != 0) ? 0x7000 : 0x0 | |
| ; this mask is used to selectively flip the sign of the x component; is pre-selected based on rotation | |
| and rax, 2 | |
| mov [rsp+32], qword 0x7000 | |
| mov rdx, dword [rsp+32 + rax * 2] ; rax = 0 | 2 => rax * 2 = 0 | 4 | |
| ; => rdx = (0x0000) iff rax & 2 != 0, else rdx = (0x7000) | |
| ; store a our transpose selector in rbx. This controls whether we transpose our x / y values | |
| ; or not, and has the following values: | |
| ; rotation_index & 1 == 0 => 0 => [rsp+32] (see below) | |
| ; rotation_index & 1 != 0 => 4 => [rsp+36] | |
| mov rbx, [rsp+16] | |
| and rbx, 1 ; rbx = 0 | 1 | |
| shl rbx, 2 ; rbx = 0 | 4 | |
| add rbx, 32 ; rbx = 32 | 36 (manual memory addressing; see below) | |
| ; | |
| ; Processing loop | |
| ; | |
| .process_points: | |
| sub rcx, 1 | |
| jl .end | |
| ; load point from array | |
| mov rax, [rsi + rcx * 8] | |
| ; Rotate our point in two steps; accomplished via selective transpose + sign flips: | |
| ; rotation value: 0 1 2 3 | |
| ; transpose? no yes no yes | |
| ; flip sign? no no yes yes | |
| ; This produces correct results for simple 90-degree rotations (where degrees = (rotation_index & 3) * 90) | |
| ; | |
| ; As you can see, transpose == (rotation & 1 != 0), sign_flip == (rotation & 2 != 0), | |
| ; which should hopefully explain our logic above. | |
| ; selectively flip sign bit of our x component (lower 32 bits) using the sign mask | |
| xor eax, rdx | |
| ; Store this point, and its transpose, on the stack: | |
| ; dword [rsp+32]: xxxx | |
| ; dword [rsp+36]: yyyy | |
| ; dword [rsp+40]: xxxx | |
| ; | |
| ; qword [rsp+32] == (x, y), | |
| ; qword [rsp+36] == (y, x) | |
| ; | |
| mov [rsp+32], rax | |
| mov [rsp+40], rax | |
| xor rax, rax | |
| ; Select using our transpose selector | |
| mov rax, [rsp + rbx] | |
| ; Finish transform by adding our origin point, and store result in output array | |
| ; Add + store x component | |
| add eax, [rsp+8] | |
| mov [rdi + rcx * 8], eax | |
| ; Add + store y component | |
| shr rax, 32 | |
| add eax, [rsp+12] | |
| mov [rdi + rcx * 8 + 4], eax | |
| ; Repeat for all array elements | |
| jmp .process_points | |
| .end: | |
| ; restore const registers | |
| mov rcx, [rsp] | |
| mov rdx, [rsp+8] | |
| mov rax, [rsp+16] | |
| mov rbx, [rsp+24] | |
| ret | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment