Last active
February 25, 2022 04:08
-
-
Save ped7g/1602d2e165850b55f6a52749d9811544 to your computer and use it in GitHub Desktop.
Z80 asm Erastothenes sieve to calculate prime numbers - part two, hunting for performance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; Author: Ped7g ; (C) 2022 ; license: MIT / GPL / Public Domain (pick whichever fits best for you) | |
; assembled with: https://github.com/z00m128/sjasmplus/ | |
; based on https://github.com/MartinezTorres/z80_babel project, written as comparison | |
; | |
; This is second version of the sieve routine (asm-like asm, focusing on performance, hard-wired buffer, etc...) | |
; For first C-like version check: https://gist.github.com/ped7g/c55bfa0d55ca13ce029549636cdd1de5 | |
; | |
; sieve routine sieve_a_2 starts around line 110, below test harness | |
; comment/uncomment the case you wan to debug: | |
primes_size EQU 3100 | |
work_size EQU 12998 /* or 12999*/ ; enough for 2860 primes, ending with 25999 (same as 26000 buffer for sieve_c_1) | |
do_case EQU caseCrc ; verification of actual primes list (2860 primes CRCed) | |
; work_size EQU 1024 ; like 2048 for sieve_c_1 ; expected result HL=0x0135 (309), primes[0] = ..., 0x7F7 (2039) | |
; do_case EQU caseZ80babel ; takes approx 46ms to finish | |
; work_size EQU 430 ; hand tuned to fit single frame in CSpect emulator | |
; do_case EQU caseTime | |
; ; sieve_a_1 fits 1 frame in CSpect at work_size 500, primes: 0x005F (95), last prime 0x01F3 (499) | |
; ; sieve_a_2 fits 1 frame in CSpect at work_size 430, primes: 0x0096 (150), last prime 0x035F (863) | |
DEFINE ORG_ADR $8080 ; using only uncontended faster memory (can't fit the maximum size buffer for case below) | |
; primes_size EQU 4 ; 8 bytes ahead of "work" buffer, using overlap of the two | |
; work_size EQU 32766 ; check all 16b numbers up to 65535 | |
; do_case EQU caseZ80babel ; 6542 primes, last prime 65521, takes about 1.8sec to generate | |
; | |
; DEFINE ORG_ADR $7E00 ; for work_size 32766, to have enough room for work buffer | |
;---------------------------------------------------------------------------------------------------------------------- | |
; test harness to debug the code in emulator | |
OPT --syntax=abf | |
DEVICE ZXSPECTRUM48,ORG_ADR-1 | |
ORG ORG_ADR | |
test_start: | |
jp do_case | |
caseZ80babel: | |
ld a,6 | |
out (254),a | |
call sieve_a_2 | |
nop ; expected result HL=0x0135 (309), primes[0] = ..., 0x7F7 (2039) | |
ld a,4 | |
out (254),a | |
jr $ | |
caseCrc: | |
ld a,5 | |
out (254),a | |
call sieve_a_2 | |
nop ; expected result HL=0x0B2C (2860), primes[0] = ..., 0x658F (25999) | |
; check result if it's as expected | |
ld a,6 | |
out (254),a | |
push hl | |
add hl,hl | |
ld bc,hl ; fake | |
ld de,0 ; crc add:xor | |
ld hl,primes | |
.crc | |
ld a,e | |
xor (hl) | |
ld e,a | |
xor d | |
add a,(hl) | |
ld d,a | |
inc hl | |
dec bc | |
ld a,b | |
or c | |
jr nz,.crc | |
ld a,4 | |
ld hl,0x0DEB | |
or a | |
sbc hl,de | |
jr z,.ok0 | |
ld a,2 | |
.ok0: | |
pop hl | |
ld de,0x0B2C | |
or a | |
sbc hl,de | |
jr z,.ok1 | |
ld a,2 | |
.ok1: | |
ld hl,25999 | |
ld de,(ix-2) ; fake | |
or a | |
sbc hl,de | |
jr z,.ok2 | |
ld a,2 | |
.ok2: | |
out (254),a | |
jr $ | |
caseTime: | |
ei | |
halt | |
ld a,5 | |
out (254),a | |
call sieve_a_2 | |
xor a | |
out (254),a | |
jr caseTime | |
;---------------------------------------------------------------------------------------------------------------------- | |
; 138B hand written asm sieve optimised for performance, taking many shortcuts, using odd-number-only sieve | |
; "work" can overlap "primes" starting from primes+8 address (to maximise amount of primes calculated and returned) | |
sieve_a_2: | |
ASSERT high work && 2 <= work_size | |
ASSERT (primes + 8 <= work) || (work + work_size <= primes) | |
ASSERT work + work_size <= 0xFFFF ; if work buffer would have last byte at 0xFFFF, the calculation of "i" breaks | |
; logical : memory address offset into work buffer, storing only odd numbers sieve | |
; work[3] : -1 (not stored in memory at all) | |
; work[5] : +0 | |
; work[7] : +1 | |
; work[9] : +2 | |
; *p++ = 2; | |
ld ix,primes | |
xor a | |
ld (ix+0),2 | |
ld (ix+1),a | |
inc ix | |
inc ix | |
; clear the work buffer (full clear, first byte of buffer is marker for prime 5) | |
; for (uint16_t i = 0; i < work_size; i++) work[i] = 0x00; | |
ld hl,work | |
ld de,work+1 | |
ld bc,work_size-1 | |
ld (hl),a | |
ldir | |
ld hl,work | |
ld bc,work_size | |
; mark prime 3 without searching by starting .mark_and_loop at work[5] | |
.mark_and_loop: | |
; work[i] signals prime ; IX = p, HL = work[i+2] = work + (i-5)/2 + 1, BC = bytes to check (>0) | |
ex de,hl | |
; i = 2*(hl - work + 1) + 1 | |
ld hl,1-work | |
add hl,de ; always cf=1 when 0xFFFF is not part of work buffer | |
adc hl,hl | |
; *p++ = i; | |
ld (ix+0),l | |
ld (ix+1),h | |
inc ix | |
inc ix | |
; IX = p, HL = i, DE = work[i+2], BC = bytes to check (BTW DE+BC = work.end(), but unable to use it) | |
; mark work buffer at all multiplies of `i` (inner loop setting the sieve) | |
; for (uint16_t j = i + i + i; j < work_size; j += i+i) work[j] = 1; | |
push bc ; preserve bytes to check | |
push de ; preserve work[i+2] | |
ld bc,-(work+work_size) | |
add hl,bc ; HL = i-(work+work_size) ; constant to check buffer overrun and add i+i to "j" | |
jr c,.inner_setup_ov ; when this happens, it happens for all following primes -> faster final loop | |
ex de,hl | |
dec hl ; HL = work[i], DE = i-(work+work_size), BC = -(work+work_size) | |
add hl,de ; j = i + i + i (adjusting work[j] address), plus test for work_size, must not overflow | |
jr c,.inner_setup_ov ; when this happens, it happens for all following primes -> faster final loop | |
.inner_loop: | |
sbc hl,bc ; restore hl to hl+j | |
ld (hl),h ; work[j] = non_zero_value | |
add hl,de ; hl += i + test for work_size, must not overflow | |
jp nc,.inner_loop | |
pop hl ; restore work[i+2] pointer = next to check | |
pop bc ; restore bytes to check | |
; outer_loop ; for (uint16_t i = 3; i < work_size; i+=2) | |
cpir | |
jp pe,.mark_and_loop ; zf=1, pv=1 | |
; pv=0 case is UNREACHABLE ; should never happen, because sooner the inner loop setup will overflow | |
; outer_loop phase 2, when marking primes is always impossible, (j >= work_size) == true because of large i | |
.inner_setup_ov: | |
pop hl ; restore work[i+2] pointer = next to check | |
pop bc ; restore bytes to check | |
jp .final_loop_entry | |
.final_loop: | |
; work[i] signals prime ; IX = p, HL = work[i+2] = work + (i-5)/2 + 1, BC = bytes to check (>0) | |
ex de,hl | |
; i = 2*(hl - work + 1) + 1 | |
ld hl,1-work | |
add hl,de ; always cf=1 when 0xFFFF is not part of work buffer | |
adc hl,hl | |
; *p++ = i; | |
ld (ix+0),l | |
ld (ix+1),h | |
inc ix | |
inc ix | |
ex de,hl ; restore HL to work[i+2] and continue with final loop | |
.final_loop_entry: | |
cpir | |
jp pe,.final_loop ; zf=1, pv=1 | |
jr nz,.final_loop_done ; zf=0, pv=0 (last work[i] was not prime) | |
; zf=1, pv=0 (match at last byte) - store last prime | |
ld de,1-work | |
add hl,de ; always cf=1 when 0xFFFF is not part of work buffer | |
adc hl,hl | |
; *p++ = i; | |
ld (ix+0),l | |
ld (ix+1),h | |
inc ix | |
inc ix | |
.final_loop_done: | |
; return p-primes; | |
ld a,ixl | |
sub low(primes) | |
ld l,a | |
ld a,ixh | |
sbc a,high(primes) | |
rra | |
ld h,a | |
rr l ; HL = number of primes | |
ret | |
DISPLAY "sieve_a_2 code size: ",/A,$-sieve_a_2 | |
;---------------------------------------------------------------------------------------------------------------------- | |
; "primes" and "work" buffers | |
primes: ds 2*primes_size,$AA | |
work: ds work_size,$BB | |
db 'X' ; the work buffer can't use 0xFFFF address, so this is guardian byte making assembling fail | |
SAVESNA "sieve2.sna", test_start ;; DEBUG snapshot |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment