Last active
December 31, 2022 02:24
-
-
Save bvibber/6642676a39ea642984ff477c649d4d37 to your computer and use it in GitHub Desktop.
imul16 for 6502
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; imul16 for 6502 -- lightly tested work in progress.
; Signed multiply: two 16-bit inputs, one 32-bit output,
; using the Atari floating point registers as argument placeholders.
; ca65 syntax.
; brion vibber, 2022
; Atari floating point registers in zero page, borrowed here as scratch
; storage for the multiply routine's arguments and result.
FR0     = $d4           ; imul16 arg1 (16-bit)
FRE     = $da           ; not referenced in the code below
FR1     = $e0           ; imul16 arg2 (16-bit)
FR2     = $e6           ; imul16 result (32-bit)
FRX     = $ec           ; not referenced in the code below

.export start

.code
; neg bytes, arg
;   Two's-complement negate, in place, the little-endian value that occupies
;   `bytes` consecutive bytes starting at `arg` (arg = 0 - arg).
;   Clobbers: A, flags.
;   Cost: 2 + 8 * bytes cycles when arg is in zero page.
.macro neg bytes, arg
    sec                         ; 2 cyc - begin the subtraction with no borrow
    .repeat bytes, offset       ; low byte first so the borrow ripples upward
        lda #0                  ; 2 cyc
        sbc arg + offset        ; 3 cyc
        sta arg + offset        ; 3 cyc
    .endrepeat
.endmacro
; neg16 arg
;   Negate the 16-bit value at `arg` in place (thin wrapper around `neg`).
;   Clobbers: A, flags.
;   Cost: 18 cycles (2 + 8 * 2).
.macro neg16 arg
    neg 2, arg
.endmacro
; neg32 arg
;   Negate the 32-bit value at `arg` in place (thin wrapper around `neg`).
;   Clobbers: A, flags.
;   Cost: 34 cycles (2 + 8 * 4).
.macro neg32 arg
    neg 4, arg
.endmacro
; bitmul16 arg1, arg2, result, bitnum
;   One step of a shift-and-add multiply; the inner loop body for imul16.
;   Expand it for bitnum = 0..15 in ascending order.
;
;   If bit `bitnum` of the 16-bit value at arg1 is set, the 16-bit value at
;   arg2 is added to the top 16 bits of the 32-bit `result`. In either case
;   the result is then shifted right by one bit, with the carry out of the
;   add (or a 0 when nothing was added) entering at the top.
;
;   arg1 must be 0 or positive
;   arg2 must be 0 or positive
;   result + 2 and result + 3 must be zeroed before the bitnum = 0 step;
;   result and result + 1 may hold anything, they are fully replaced by the
;   bits shifted down over the 16 steps.
;
;   Clobbers: A, flags.
;   Cost with zero-page operands and no page-crossing branch:
;     bit clear: 25 cycles (bitnum < 8), 30 cycles (bitnum >= 8)
;     bit set:   40 cycles (bitnum < 8), 45 cycles (bitnum >= 8)
.macro bitmul16 arg1, arg2, result, bitnum
    .local bit_set
    .local shift_low

    ; check if arg1 has 0 or 1 bit in this place
    .if bitnum < 8
        lda arg1                    ; 3 cyc
        and #(1 << bitnum)          ; 2 cyc
    .else
        lda arg1 + 1                ; 3 cyc
        and #(1 << (bitnum - 8))    ; 2 cyc
    .endif
    bne bit_set                     ; 2 cyc (3 when taken)

    ; Bit is clear: nothing to add, but the top byte must still take part in
    ; the shift. lsr feeds a 0 into bit 7 and drops bit 0 into the carry,
    ; where the ror chain below picks it up.
    lsr result + 3                  ; 5 cyc
    jmp shift_low                   ; 3 cyc

bit_set:
    ; 16-bit add on the top bits
    clc                             ; 2 cyc
    lda result + 2                  ; 3 cyc
    adc arg2                        ; 3 cyc
    sta result + 2                  ; 3 cyc
    lda result + 3                  ; 3 cyc
    adc arg2 + 1                    ; 3 cyc
    ; Shift the top byte while it is still in A: the carry out of the add
    ; becomes bit 7 and bit 0 drops into the carry for the ror chain. This is
    ; the only shift the top byte gets on this path.
    ror a                           ; 2 cyc
    sta result + 3                  ; 3 cyc

shift_low:
    ; Shift the remaining bytes down by one bit, carry rippling downward.
    ror result + 2                  ; 5 cyc
    ror result + 1                  ; 5 cyc
    .if bitnum >= 8
        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this
        ; byte while everything that would reach it is still the
        ; uninitialized data shifted out of result + 1
        ror result                  ; 5 cyc
    .endif
.endmacro
; check_sign arg
;   If the 16-bit value at `arg` is negative, replace it with its negation
;   (making it positive) and increment X. X therefore counts how many
;   arguments were negative; the caller is responsible for zeroing X first.
;   Clobbers: A, flags (and X as described).
;   Cost with a zero-page arg: 6 cycles when already non-negative,
;   25 cycles when the value has to be negated.
.macro check_sign arg
    .local done
    lda arg + 1                 ; 3 cyc - high byte carries the sign bit
    bpl done                    ; 2 cyc (3 when taken)
    neg16 arg                   ; 18 cyc
    inx                         ; 2 cyc
done:
.endmacro
; imul16: signed 16-bit x 16-bit -> 32-bit multiply.
;   in:       FR0 = arg1, FR1 = arg2 (16-bit two's complement, low byte first)
;   out:      FR2 = 32-bit product (two's complement, low byte first)
;   clobbers: A, X, flags; arg1 and arg2 are left holding their negated
;             (positive) values if they came in negative
;   Time is dominated by the 16 unrolled bitmul16 expansions (see the timing
;   notes on that macro); everything outside them adds roughly 30 to 85
;   cycles depending on the input signs.
.proc imul16
    arg1   = FR0                ; 16-bit arg (clobbered)
    arg2   = FR1                ; 16-bit arg (clobbered)
    result = FR2                ; 32-bit result

    ; X starts at zero: it is the negative-input counter, and it doubles as
    ; the zero used to clear the top 16 bits of the result. The bottom two
    ; bytes need no clearing, the shifts push their old contents out.
    ldx #0                      ; 2 cyc
    stx result + 2              ; 3 cyc
    stx result + 3              ; 3 cyc

    ; make both inputs non-negative, counting the negative ones in X
    check_sign arg1
    check_sign arg2

    ; one bitmul16 step per bit of arg1, fully unrolled for speed at the
    ; cost of a larger routine
    .repeat 16, bit_index
        bitmul16 arg1, arg2, result, bit_index
    .endrepeat

    ; exactly one negative input (mixed signs) means a negative product
    cpx #1                      ; 2 cyc
    bne done                    ; 2 cyc (3 when taken)
    neg32 result                ; 34 cyc
done:
    rts                         ; 6 cyc
.endproc
; iter: placeholder for the per-point iteration loop. It currently contains
; only a label and design notes, and emits no instructions.
.proc iter
    ; Planned algorithm (the fixed-point handling is still being worked out).
    ;
    ; setup -- cx and cy should be pre-scaled to 6.26 fixed point:
    ;   zx    = 0
    ;   zy    = 0
    ;   zx_2  = 0
    ;   zy_2  = 0
    ;   zx_zy = 0
next_iter:
    ;   iters++
    ;
    ;   in 6.26:
    ;     zx = zx_2 + zy_2 + cx
    ;       NOTE(review): if this is the usual Mandelbrot step, this should
    ;       read zx_2 - zy_2 + cx -- confirm before implementing.
    ;     zy = zx_zy + zx_zy + cy
    ;   round to 6.10.
    ;
    ;   in 12.20:
    ;     zx_2 = zx * zx
    ;     zy_2 = zy * zy
    ;       NOTE(review): zx_zy is never recomputed in these notes;
    ;       presumably zx_zy = zx * zy belongs here -- confirm.
    ;     dist = zx_2 + zy_2
    ;   if dist >= 4 break, else continue iterating
    ;   round zx_2, zy_2, dist to 6.26
    ;
    ;   if the point may be in the lake, look for looping output with a
    ;   small buffer, as an optimization vs running to max iters
.endproc
; start: exported entry point. A test driver that loads 5 and -3 into the
; argument registers and calls imul16, over and over, forever.
.proc start
    MULTIPLICAND = 5
    MULTIPLIER   = -3

again:
    ; FR0 = 5
    lda #<MULTIPLICAND
    sta FR0
    lda #>MULTIPLICAND
    sta FR0 + 1

    ; FR1 = -3 ($fffd)
    lda #<MULTIPLIER
    sta FR1
    lda #>MULTIPLIER
    sta FR1 + 1

    jsr imul16
    ; FR2 should now hold 32-bit -15

    jmp again
.endproc
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment