Skip to content

Instantly share code, notes, and snippets.

@SeijiEmery
Last active May 13, 2017 17:14
Show Gist options
  • Save SeijiEmery/c8d98d9f3a3030da016ad1554802ae8a to your computer and use it in GitHub Desktop.
Save SeijiEmery/c8d98d9f3a3030da016ad1554802ae8a to your computer and use it in GitHub Desktop.
Simple algorithm for translating a point in assembly for a tetris clone / any 90-degree-rotation based game. Not tested, but has several optimization passes (should work). This is what happens when I get bored, lol.
; Transform an x/y point stored in rax w/ a rotation index (0-3) and origin x/y point in rdx.
proc transform_point (in rax pt, in rdx p0, in rcx rot_index) => (out rax pt)
section .data
; Rotation table: transpose mask + sign-flip mask for x coord.
sign_mask: dd 0x0, 0x0, 0x7000, 0x7000
; tr_masks: dq 0x0000ffff, 0xffff0000 ;, 0x0000ffff, 0xffff0000
tr_masks: dq 0x00000000, 0xffffffff
section .text
; Save rbx
push rbx
; Clamp rotation index to [0, 3]
and rcx, 3
; maybe flip x sign
xor eax, [sign_mask + rcx * 4]
; maybe transpose x / y
; let rbx = rax transposed (flipped x / y)
push rbx ; save rbx
mov rbx, rax
shl rbx, 32
or ebx, eax
; TO COMBINE, EITHER: (dunno which; use one or the other)
; USE cmov branch (branch may be costly)
and rcx, 1
cmovnz rax, rbx
; OR use memory + masks (more instructions)
; mask rax by tr_masks[ecx & 1]
and rcx, 1
and rax, [tr_masks + rcx * 8]
; mask rbx by tr_masks[(ecx+1) & 1]
inc rcx
and rcx, 1
and rbx, [tr_masks + rcx * 8]
; combine rax, rbx
or rax, rbx
; END
; add p0 to pt
add eax, edx ; add x component
and rdx, 0xffff0000 ; mask y only in rdx
add rdx, rax ; add y in rdx
add rax, rdx ; add back
; restore rbx; clear rdx / rcx
pop rbx
xor rdx, rdx
xor rcx, rcx
ret
;
; OPTIMIZATION 1:
;
; Transform an x/y point stored in rax w/ a rotation index (0-3) and
proc transform_point (in rax pt, in rdx p0, in rcx rot_index) => (out rax pt)
section .data
; Rotation table: transpose mask + sign-flip mask for x coord.
sign_mask: dd 0x0, 0x0, 0x7000, 0x7000
; tr_masks: dq 0x0000ffff, 0xffff0000 ;, 0x0000ffff, 0xffff0000
tr_masks: dq 0x00000000, 0xffffffff
section .text
; Save rbx
push rbx
; push sign + transform masks
mov [rsp], qword 0xffffffff ; utility mask: [rsp+0]: all bits set (64 bits)
mov [rsp+8], qword 0x00000000 ; [rsp+8]: all bits clear (64 bits)
mov [rsp+16], dword 0x7000 ; overlapping sign mask:
; [rsp+12]: clear mask (32 bits)
; [rsp+16]: sign bit (32 bits)
; flip x sign if rotation_index & 2 != 0
mov rbx, rcx
and rbx, 2
xor eax, dword [rsp+12 + rbx * 4]
; apply x / y transpose if rotation_index & 1 != 0
; let rbx = rax transposed (flipped x / y)
push rbx
mov rbx, rax
shl rbx, 32
or ebx, eax
; TO COMBINE, EITHER: (dunno which; use one or the other)
; USE cmov branch (branch may be costly)
and rcx, 1
cmovnz rax, rbx
; OR use memory + masks (more instructions)
; mask rax by tr_masks[ecx & 1]
and rcx, 1
and rax, qword [rsp + rcx * 8]
; mask rbx by tr_masks[(ecx+1) & 1]
inc rcx
and rcx, 1
and rbx, qword [rsp + rcx * 8]
; combine rax, rbx
or rax, rbx
; END
; add p0 to pt
; Note: this is more complicated than just 'add rax, rdx', which would *mostly* work.
; The one problem is the edge case where high x / y values
add eax, edx ; add x component
; To add y component:
and rdx, 0xffff0000 ; mask to y only; prevents potential overflow from bits [0-32)
add rdx, rax ; add y component in rdx
and rdx, 0xffff0000 ; mask to remove x component in rdx, and y component in eax
and rax, 0xffff
add rax, rdx ; merge components; safe, since x / y values now orthogonal
and rdx, 0xffff0000 ; mask y only in rdx
add rdx, rax ; add y components into rdx; rdx = y only
and rdx, 0xffff0000 ; mask to ensure y only
and rax, 0xffff ; mask rax = x only
add rax, rdx ; merge x / y components.
; restore rbx; clear rdx / rcx
pop rbx
xor rdx, rdx
xor rcx, rcx
ret
;
; OPTIMIZATION 2: refactor to an array processing algorithm, where we now process points in bulk.
;
proc transform_points (
in ptr rsi 32_bit_xy_input_point_array,
out ptr rdi 32_bit_xy_output_point_array,
const rcx array_size,
const rdx p0,
const rax rotation_index
) {
; save const registers + rbx
mov [rsp], rcx ; initial array size
mov [rsp+8], rdx ; origin point (p0)
mov [rsp+16], rax ; rotation index ([0,3]; can be any value but we only use the lower bits)
mov [rsp+24], rbx ; saved value for rbx
; store a dword sign mask at [rsp+32]: mask = (rax & 2 != 0) ? 0x7000 : 0x0
; this mask is used to selectively flip the sign of the x component; is pre-selected based on rotation
and rax, 2
mov [rsp+32], qword 0x7000
mov rax, dword [rsp+32 + rax * 2] ; rax = 0 | 2 => rax * 2 = 0 | 4
mov [rsp+32], rax ; => [rsp + 32] (0x0000) iff rax & 2 != 0, else [rsp + 36] (0x7000)
; store a our transpose selector at dword [rsp+36]. This controls whether we transpose our x / y values
; or not, and has the following values:
; rotation_index & 1 == 0 => 0
; rotation_index & 1 != 0 => 4
mov rax, [rsp+16]
and rax, 1
shl rax, 2
mov [rsp+36], eax
.process_points:
sub rcx, 1
jl .end
; load point from array
mov rax, [rsi + rcx * 8]
; Rotate our point in two steps; accomplished via selective transpose + sign flips:
; rotation value: 0 1 2 3
; transpose? no yes no yes
; flip sign? no no yes yes
; This produces correct results for simple 90-degree rotations (where degrees = (rotation_index & 3) * 90)
;
; As you can see, transpose == (rotation & 1 != 0), sign_flip == (rotation & 2 != 0),
; which should hopefully explain our logic above.
; selectively flip sign bit of our x component (lower 32 bits) using the sign mask
xor eax, [rsp+32]
; Store this point, and its transpose, on the stack:
; dword [rsp+40]: xxxx
; dword [rsp+44]: yyyy
; dword [rsp+48]: xxxx
;
; qword [rsp+40] == (x, y),
; qword [rsp+44] == (y, x)
;
mov [rsp+40], rax
mov [rsp+48], rax
xor rax, rax
; Select using our transpose selector
mov eax, [rsp+36] ; load selector
mov rax, [rsp+40 + eax] ; load value | transposed value
; Finish transform by adding our origin point, and store result in output array
; Add + store x component
add eax, [rsp+8]
mov [rdi + rcx * 8], eax
; Add + store y component
shr rax, 32
add eax, [rsp+12]
mov [rdi + rcx * 8 + 4], eax
; Repeat for all array elements
jmp .process_points
.end:
; restore const registers
mov rcx, [rsp]
mov rdx, [rsp+8]
mov rax, [rsp+16]
ret
}
;
; Optimization 3: reduce memory ops by storing masks in unused registers (we were only using 4!)
; Still plenty of stack memory use b/c it's efficient (L1), branchless, and makes transposing points much easier.
;
; Transforms an array of points, applying a simple 90-degree rotation + offset from a fixed point
; Inputs are 32-bit signed integers in a packed array as (x, y) values. Values are in LOCAL space.
; Offset is a 32-bit (x, y) pair in WORLD space, packed into a 64-bit register.
; All addressing is little-endian.
;
; Rotation is an integer on [0,4) representing 90-degree increments; values wrap.
; degrees = 90 * rotation (0 => 0 degrees, 1 => 90 degrees, 2 => 180 degrees, etc.)
;
proc transform_points (
in ptr rsi 32_bit_xy_input_point_array,
out ptr rdi 32_bit_xy_output_point_array,
const rcx array_size,
const rdx p0,
const rax rotation_index
) {
; save const registers + rbx
mov [rsp], rcx ; initial array size
mov [rsp+8], rdx ; origin point (p0)
mov [rsp+16], rax ; rotation index ([0,3]; can be any value but we only use the lower bits)
mov [rsp+24], rbx ; saved value for rbx
;
; Pre calculations:
;
; store a dword sign mask in rdx: mask = (rax & 2 != 0) ? 0x7000 : 0x0
; this mask is used to selectively flip the sign of the x component; is pre-selected based on rotation
and rax, 2
mov [rsp+32], qword 0x7000
mov rdx, dword [rsp+32 + rax * 2] ; rax = 0 | 2 => rax * 2 = 0 | 4
; => rdx = (0x0000) iff rax & 2 != 0, else rdx = (0x7000)
; store a our transpose selector in rbx. This controls whether we transpose our x / y values
; or not, and has the following values:
; rotation_index & 1 == 0 => 0 => [rsp+32] (see below)
; rotation_index & 1 != 0 => 4 => [rsp+36]
mov rbx, [rsp+16]
and rbx, 1 ; rbx = 0 | 1
shl rbx, 2 ; rbx = 0 | 4
add rbx, 32 ; rbx = 32 | 36 (manual memory addressing; see below)
;
; Processing loop
;
.process_points:
sub rcx, 1
jl .end
; load point from array
mov rax, [rsi + rcx * 8]
; Rotate our point in two steps; accomplished via selective transpose + sign flips:
; rotation value: 0 1 2 3
; transpose? no yes no yes
; flip sign? no no yes yes
; This produces correct results for simple 90-degree rotations (where degrees = (rotation_index & 3) * 90)
;
; As you can see, transpose == (rotation & 1 != 0), sign_flip == (rotation & 2 != 0),
; which should hopefully explain our logic above.
; selectively flip sign bit of our x component (lower 32 bits) using the sign mask
xor eax, rdx
; Store this point, and its transpose, on the stack:
; dword [rsp+32]: xxxx
; dword [rsp+36]: yyyy
; dword [rsp+40]: xxxx
;
; qword [rsp+32] == (x, y),
; qword [rsp+36] == (y, x)
;
mov [rsp+32], rax
mov [rsp+40], rax
xor rax, rax
; Select using our transpose selector
mov rax, [rsp + rbx]
; Finish transform by adding our origin point, and store result in output array
; Add + store x component
add eax, [rsp+8]
mov [rdi + rcx * 8], eax
; Add + store y component
shr rax, 32
add eax, [rsp+12]
mov [rdi + rcx * 8 + 4], eax
; Repeat for all array elements
jmp .process_points
.end:
; restore const registers
mov rcx, [rsp]
mov rdx, [rsp+8]
mov rax, [rsp+16]
mov rbx, [rsp+24]
ret
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment