Created
May 31, 2020 13:35
-
-
Save hcs64/68b8cd7fa167880cd4f975c3d458c9ff to your computer and use it in GitHub Desktop.
4bpp and 8bpp conversions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// RSP ucode for converting 2bpp NES to 4bpp and 8bpp N64 | |
// TODO sprite priority | |
align(8) | |
scope Ucode: { | |
// Data image DMA'd into DMEM by Boot; the tables below sit at fixed
// offsets and each is loaded into a dedicated vector register.
InitialDMEM: | |
pushvar base | |
base 0x0000 | |
// Unfortunately it isn't possible to use constants for vector register names(?) | |
// v31 | |
// 8 zero halfwords; sqv'd repeatedly to clear the sprite line buffer.
Zeroes: | |
fill 8*2,0 | |
// v10 | |
// Lane n has bit n set in BOTH bytes of the halfword, so vand against a
// broadcast of lane n isolates bit n of both bitplanes at once (see
// select_bits below).
BitsOfBytes: | |
dh 0x0101, 0x0202, 0x0404, 0x0808 | |
dh 0x1010, 0x2020, 0x4040, 0x8080 | |
// Isolate each bit 0..7 of {src} into its own register: after expansion,
// vN = {src} AND BitsOfBytes[N], i.e. bit N of both bitplane bytes of
// every row, left in its original bit position within each lane.
// (e8..e15 element selectors broadcast v10 elements 0..7.)
macro select_bits(src) { | |
// select bits | |
vand v7,{src},v10[e15] | |
vand v6,{src},v10[e14] | |
vand v5,{src},v10[e13] | |
vand v4,{src},v10[e12] | |
vand v3,{src},v10[e11] | |
vand v2,{src},v10[e10] | |
vand v1,{src},v10[e9] | |
vand v0,{src},v10[e8] | |
} | |
// 4bpp | |
// Pseudo shifts (and adds) to put bits in 12,11 and 8,7 (for sfv) | |
// v8 | |
// Right-shift multiplier table for the bg loop (used with vmudl/vmadl,
// which keep only the high half of the product shifted down 16): a value
// of 1<<(16-k) right-shifts by k. Two elements are repurposed for the
// attribute-promotion multiplies (see comments on those entries).
ShiftMux0: | |
dh 1<<(16+( 7- 8)) // -16 | |
//dh 1<< (11- 9) // unused, combined with mux1 | |
dh 0b0100'0100 // used to promote attributes (4bpp) | |
dh 1<<(16+( 7-10)) // -16 | |
dh 1<<(16-(8-2)) // used to promote attributes (8bpp) | |
//dh 1<< (11-11) // unused, combined with mux1 | |
dh 1<<(16+( 7-12)) // -16 | |
dh 1<<(16+(11-13)) // -16 | |
dh 1<<(16+( 7-14)) // -16 | |
dh 1<<(16+(11-15)) // -16 | |
// v9 | |
// Left-shift multiplier table (used with vmudn/vmadn, which keep the low
// half of the product): a value of 1<<k left-shifts by k. Some entries
// combine two left shifts in one multiplier (bits land far enough apart
// not to collide).
ShiftMux1: | |
dh 1<<( 8- 0) | |
dh (1<<(12- 1))|(1<<(11-9)) | |
dh 1<<( 8- 2) | |
dh (1<<(12- 3))|(1<<(11-11)) | |
dh 1<<( 8- 4) | |
dh 1<<(12- 5) | |
dh 1<<( 8- 6) | |
dh 1<<(12- 7) | |
// 8bpp | |
// Pseudo shifts to put bits in 9,8 and 1,0 (for sdv) | |
// v29 | |
// Sprite right-shift multipliers (vmudl/vmadl), one per source bit
// position. The diagrams show, for each entry: the source bit pair,
// then the two shifted copies that are summed, and the resulting layout.
// A = high bitplane bit, B = low bitplane bit of the same pixel.
ShiftMuxSp0: | |
// ....'...A ....'...B | |
// | |
// ....'.... ....'...A | |
// + ....'..A. ....'..B. | |
// = ....'..A. ....'..BA | |
dh 0x10000>>-( 0- 8) // >>8 | <<1 | |
// ....'..A. ....'..B. | |
// | |
// ....'...A ....'...B | |
// + ....'..B. ....'.... | |
// = ....'..BA ....'...B | |
dh 0x10000>>-( 8- 9) // >>1 | <<8 | |
// ....'.A.. ....'.B.. | |
// | |
// ....'.... ....'...A | |
// + ....'..A. ....'..B. | |
// = ....'..A. ....'..BA | |
dh (0x10000>>-( 0-10))|(0x10000>>-( 1- 2)) // >>10 | >> 1 | |
// ....'A... ....'B... | |
// | |
// ....'...A ....'...B | |
// + ....'..B. ....'.... | |
// = ....'..BA ....'...B | |
dh 0x10000>>-( 8-11) // >>3 | << 6 | |
// ...A'.... ...B'.... | |
// | |
// ....'.... ....'...A | |
// + ....'..A. ....'..B. | |
// = ....'..A. ....'..BA | |
dh (0x10000>>-( 0-12))|(0x10000>>-( 1- 4)) // >>12 | >> 3 | |
// ..A.'.... ..B.'.... | |
// | |
// ....'...A ....'...B | |
// + ....'..B. ....'.... | |
// = ....'..BA ....'...B | |
dh 0x10000>>-( 8-13) // >>5 | << 4 | |
// .A..'.... .B..'.... | |
// | |
// ....'.... ....'...A | |
// + ....'..A. ....'..B. | |
// = ....'..A. ....'..BA | |
dh (0x10000>>-( 0-14))|(0x10000>>-( 1- 6)) // >>14 | >> 5 | |
// A...'.... B...'.... | |
// | |
// ....'...A ....'...B | |
// + ....'..B. ....'.... | |
// = ....'..BA ....'...B | |
dh 0x10000>>-( 8-15) // >>7 | << 2 | |
// v30 | |
// Sprite left-shift multipliers (vmudn/vmadn companions of ShiftMuxSp0);
// the "right (unused)" entries are placeholders where that column only
// needs the combined right-shift from v29.
ShiftMuxSp1: | |
dh 1<< ( 1- 0) // left | |
dh 1<< ( 9- 1) // left | |
dh 0x10000>>-( 1- 2) // right (unused) | |
dh 1<< ( 9- 3) // left | |
dh 0x10000>>-( 1- 4) // right (unused) | |
dh 1<< ( 9- 5) // left | |
dh 0x10000>>-( 1- 6) // right (unused) | |
dh 1<< ( 9- 7) // left | |
// v28 | |
// Masks for the sprite path: element 0 (0x0300) keeps bits 9,8 and
// element 1 (0x0003) keeps bits 1,0 after the pseudo shifts above.
Masks: | |
dh 0x0300, 0x0003, 0, 0 | |
dh 0,0,0,0 | |
pullvar base | |
// IMEM | |
align(8) | |
// Code image; assembled at base 0 for upload into RSP IMEM.
IMEM: | |
pushvar base | |
base 0x0000 | |
// DMEM layout: source line data at offset 0, destination buffer
// immediately after it. (conv_src_size and the src_*/dst_* offsets are
// defined outside this file — TODO confirm their values against the CPU side.)
constant dmem_src(0) | |
constant dmem_dst(conv_src_size) | |
arch n64.rsp | |
// Entry point. DMAs the InitialDMEM image from RDRAM into DMEM offset 0,
// loads the constant vector registers from it, then falls through to
// ResetFrame which sets up the RDRAM src/dst pointers and breaks to
// signal the CPU.
Boot: | |
- | |
// Spin until a DMA slot is free.
mfc0 t0, C0_DMA_FULL | |
bnez t0,- | |
nop | |
mtc0 r0, C0_MEM_ADDR | |
la a0, Ucode.InitialDMEM | |
mtc0 a0, C0_DRAM_ADDR | |
// DMA length registers are programmed as (byte count - 1); this pulls in
// the full 4KB DMEM image.
lli t0, 0x1000-1 | |
mtc0 t0, C0_RD_LEN | |
- | |
// Wait for the DMA to complete before reading the tables out of DMEM.
mfc0 t0, C0_DMA_BUSY | |
bnez t0,- | |
nop | |
// Load the constant tables into their dedicated vector registers.
lqv v8[e0],ShiftMux0(r0) | |
lqv v9[e0],ShiftMux1(r0) | |
lqv v10[e0],BitsOfBytes(r0) | |
lqv v28[e0],Masks(r0) | |
lqv v29[e0],ShiftMuxSp0(r0) | |
lqv v30[e0],ShiftMuxSp1(r0) | |
lqv v31[e0],Zeroes(r0) | |
// Reset the RDRAM pointers to the start of the source/destination
// buffers (masked down to physical addresses for the DMA engine).
ResetFrame: | |
la a0, conv_src_buffer & 0x7f'ffff | |
la a1, conv_dst_buffer & 0x7f'ffff | |
break | |
// Advance both RDRAM pointers past one batch of lines without converting
// anything, then break to signal the CPU.
SkipLines: | |
addi a0, conv_src_size | |
addi a1, conv_dst_size | |
break | |
// Convert one batch of lines: DMA the source data into DMEM, run the
// background tile conversion (bg_loop), apply fine X scroll, DMA the BG
// out, then convert and scatter the sprites.
ConvertLines: | |
- | |
mfc0 t0, C0_DMA_FULL | |
bnez t0,- | |
nop | |
lli t0, dmem_src | |
mtc0 t0, C0_MEM_ADDR | |
mtc0 a0, C0_DRAM_ADDR | |
// Length register takes (byte count - 1).
lli t0, conv_src_size-1 | |
mtc0 t0, C0_RD_LEN | |
- | |
mfc0 t0, C0_DMA_BUSY | |
bnez t0,- | |
nop | |
// a2 = BG pattern source, a3 = BG destination, t8 = BG attribute source,
// t0 = remaining pattern bytes (bg_loop counter, decremented by 16/tile).
lli a2, dmem_src + src_bg_pat | |
lli a3, dmem_dst + dst_bg | |
lli t8, dmem_src + src_bg_atr | |
lli t0, src_sp_pat - src_bg_pat | |
lqv v11[e0], 0(a2) // v11 = Tile BitPlane 0,1 Row 0..7 | |
bg_loop: | |
// We're doing the same operation for each of 8 8-pixel rows in v11. | |
// An element holds the high and low bitplane for one row. | |
// select bits | |
select_bits(v11) | |
// Prefetch the next tile | |
lqv v11[e0], 16(a2) // v11 = Tile BitPlane 0,1 Row 0..7 | |
addiu a2, 16 | |
// Column 7,6 | |
// The elements of v0-v7 now contain each bit 0-7 of both bitplanes. | |
// For columns 7 and 6, we want to go from | |
// AB.. .... CD.. .... | |
// to packed within 14-7 | |
// ...C A..D B... .... | |
// in order to be in place for sfv to write that as | |
// ..C A..D B | |
// This involves shifting A and B right by different amounts, and | |
// C and D left by different amounts, and finally combining them all. | |
// | |
// Since we have each bit of a byte in its own reg | |
// v7 = A... .... C... .... | |
// v6 = .B.. .... .D.. .... | |
// we want to do these shifts: | |
// (11-15) = >> 4 = .... A... .... C... | |
// (12- 7) = << 5 = ...C .... .... .... | |
// ( 7-14) = >> 7 = .... .... B... .... | |
// ( 8- 6) = << 2 = .... ...D .... .... | |
// Left shifts use vm?n, a multiply, which doesn't clamp until bit 31. | |
// Right shifts use vm?l, which shifts the multiply result down by 16. | |
// Occasionally (see cols 1 & 3) we can do two shifts together if both | |
// bits need to be shifted in the same direction. | |
// Since there is only one bit in each column (they were 8 bits apart | |
// so only one of each pair ends up in the 14-7 window) we can combine | |
// with the accumulator using the vma? ops. | |
// We don't need to worry about the bits outside of 14-7, as long | |
// as they don't carry into bit 7, which isn't the case here. | |
vmudl v12,v7,v8[e15] | |
vmadn v12,v7,v9[e15] | |
vmadl v12,v6,v8[e14] | |
vmadn v12,v6,v9[e14] | |
// Column 5,4 | |
vmudl v13,v5,v8[e13] | |
vmadn v13,v5,v9[e13] | |
vmadl v13,v4,v8[e12] | |
vmadn v13,v4,v9[e12] | |
// Load attribute bits | |
// (unsigned bytes land in bits 14-7 of each lane; see comment below.)
luv v16[e0], 0(t8) | |
// Column 3,2 | |
// Columns 3 and 1 need only left shifts: their right-shift entries in
// ShiftMux0 are marked "unused, combined with mux1", so the chain starts
// with vmudn instead of vmudl.
vmudn v14,v3,v9[e11] | |
vmadl v14,v2,v8[e10] | |
vmadn v14,v2,v9[e10] | |
// Column 1,0 | |
vmudn v15,v1,v9[e9] | |
vmadl v15,v0,v8[e8] | |
vmadn v15,v0,v9[e8] | |
// Add attribute bits to pixels | |
// These are in bytes as | |
// .... ..AB | |
// luv loads them as | |
// .... ...A B... .... | |
// and we want them at | |
// .AB. .AB. .... .... | |
// so they can end up, after sfv, as | |
// AB.. AB.. | |
// this is done by multiplying with 0b0100'0100, | |
// effectively (x<<2)|(x<<6), which is stashed in | |
// an otherwise unused element of ShiftMux0. | |
vmudn v16,v16,v8[e9] | |
// Each element of v16 holds doubled attributes for | |
// each row, combine with each pair of pixels. | |
// (The attribute bits occupy the upper nibble of each 4bpp pixel pair;
// v12-v15 hold the pattern bits in the lower positions, so OR merges them.)
vor v12,v12,v16[e0] | |
vor v13,v13,v16[e0] | |
vor v14,v14,v16[e0] | |
vor v15,v15,v16[e0] | |
// Store the packed 4bpp columns. The sfv path writes bits 14-7 of four
// lanes at once; the fallback simulates it with scalar mfc2 + srl + sb,
// one lane at a time (same byte values, much slower).
//define HAS_SFV() | |
if {defined HAS_SFV} { | |
// Store Columns 7,6 | |
sfv v12[e0],0(a3) | |
sfv v12[e8],16(a3) | |
addi a3, 1 | |
// Store Columns 5,4 | |
sfv v13[e0],0(a3) | |
sfv v13[e8],16(a3) | |
addi a3, 1 | |
// Store Columns 3,2 | |
sfv v14[e0],0(a3) | |
sfv v14[e8],16(a3) | |
addi a3, 1 | |
// Store Columns 1,0 | |
sfv v15[e0],0(a3) | |
sfv v15[e8],16(a3) | |
// Skip past the rest of this tile row's output.
addi a3, 1+(32-4) | |
} else { | |
// Emulate one sfv column-group store for element {e}: extract the lane,
// shift the 14-7 pixel window down to bits 7-0, store one byte per
// column pair.
macro sfv_sim(e) { | |
// NOTE(review): the evaluate rep_i lines look like leftover scaffolding
// from an unrolled-loop version; rep_i is never referenced — confirm
// and consider removing.
evaluate rep_i(0) | |
mfc2 t1,v12[{e}] | |
srl t2, t1, 7 | |
sb t2, 0(a3) | |
addiu a3, 1 | |
mfc2 t1,v13[{e}] | |
srl t2, t1, 7 | |
sb t2, 0(a3) | |
addiu a3, 1 | |
mfc2 t1,v14[{e}] | |
srl t2, t1, 7 | |
sb t2, 0(a3) | |
addiu a3, 1 | |
mfc2 t1,v15[{e}] | |
srl t2, t1, 7 | |
sb t2, 0(a3) | |
addiu a3, 1 | |
evaluate rep_i({rep_i}+1) | |
} | |
// One expansion per row (element pair) of the tile.
sfv_sim(e0) | |
sfv_sim(e2) | |
sfv_sim(e4) | |
sfv_sim(e6) | |
sfv_sim(e8) | |
sfv_sim(e10) | |
sfv_sim(e12) | |
sfv_sim(e14) | |
} | |
// Loop control: 16 pattern bytes consumed per tile; t8 advances to the
// next tile's attribute bytes (branch-delay slot).
addi t0, -16 | |
bnez t0, bg_loop | |
addi t8, 8 | |
// Adjust for fine X scroll | |
// Shift each converted 4bpp line left by its per-line fine-X value so the
// visible window starts at the right pixel.
scope FineXBG { | |
constant lines_left(sp_s0) | |
constant left_shift(sp_s1) | |
constant right_shift(sp_s2) | |
constant leftover(sp_s3) | |
constant tiles_left(sp_s4) | |
constant tile32(sp_s5) | |
// Working backwards in each line, shift each tile left, combine with the | |
// part that was shifted out of the previous tile. | |
// a2 points at the last 32-bit tile-row word of the last line; tile32 at
// the extra 33rd tile column that seeds the first leftover.
lli a2, dmem_dst+dst_bg+(32*8-1)*4 | |
lli tile32, dmem_dst+dst_bg+(33*8-1)*4 | |
lli lines_left, 8-1 | |
shift_line_loop: | |
// Per-line fine X (in pixels), indexed by line number.
lbu left_shift, dmem_src+src_bg_x (lines_left) | |
lw leftover, 0 (tile32) | |
addi tile32, -4 | |
lli tiles_left, 32 | |
bnez left_shift,+ | |
// Pixels -> bits: 4 bits per pixel at 4bpp (delay slot, harmless if taken).
sll left_shift, 2 | |
// srlv can only do up to 31 bits, so we can't do this for X=0 (it would be useless anyway) | |
j shift_tile_loop_end | |
addi a2, -32*4 | |
+ | |
// overloading tiles_left's 32 as 32 bits here | |
sub right_shift, tiles_left, left_shift | |
srlv leftover, right_shift | |
shift_tile_loop: | |
// SU loads take 3 cycles, so unroll these loops 4x | |
evaluate rep_i(0) | |
while {rep_i} < 4 { | |
evaluate src(t0 + {rep_i}) | |
lw {src}, -{rep_i} * 4 (a2) | |
evaluate rep_i({rep_i} + 1) | |
} | |
evaluate rep_i(0) | |
while {rep_i} < 4 { | |
evaluate src(t0 + {rep_i}) | |
evaluate shifted(t4 + {rep_i}) | |
// word = (word << x) | bits shifted out of the tile to the right;
// save the bits shifted out of this word for the next (leftward) tile.
sllv {shifted}, {src}, left_shift | |
or {shifted}, leftover | |
srlv leftover, {src}, right_shift | |
sw {shifted}, -{rep_i} * 4 (a2) | |
evaluate rep_i({rep_i} + 1) | |
} | |
addi tiles_left, -4 | |
bnez tiles_left, shift_tile_loop | |
addi a2, -4*4 | |
shift_tile_loop_end: | |
bnez lines_left, shift_line_loop | |
addi lines_left, -1 | |
} // end scope FineXBG | |
// DMA out BG | |
lli t0, dmem_dst | |
mtc0 t0, C0_MEM_ADDR | |
mtc0 a1, C0_DRAM_ADDR | |
// Length register takes (byte count - 1).
lli t0, dst_sp-dst_bg-1 | |
mtc0 t0, C0_WR_LEN | |
addi a1, dst_sp-dst_bg | |
// There's only one port to DMEM, so it's a good idea to wait for this DMA to | |
// finish before proceeding to the sprites. | |
- | |
mfc0 t0, C0_DMA_BUSY | |
bnez t0,- | |
nop | |
// ##### Sprites | |
// Zero out the lines | |
// Clear 8 lines x 256 bytes of sprite line buffer, 16 bytes per sqv.
lli t0, dmem_dst+0x80 | |
lli t1, 256*8 | |
- | |
sqv v31[e0],0(t0) | |
addi t1, -16 | |
bnez t1,- | |
addi t0, 16 | |
// Convert to 8bpp | |
// a2 = sprite pattern source, a3 = sprite attribute source,
// sp_s1 = line counter (8 lines, 2 consumed per sprite_loop iteration),
// sp_s2 = per-sprite X offsets, sp_s3 = current destination line base.
lli a2, dmem_src+src_sp_pat | |
lli a3, dmem_src+src_sp_atr | |
lli sp_s1, 8 | |
lli sp_s2, dmem_src+src_sp_x | |
lli sp_s3, dmem_dst+0x80 | |
lqv v11[e0], 0(a2) | |
// Each iteration converts one line of 8 sprites twice (two lines total)
// and scatters the results into the line buffers below.
sprite_loop: | |
// Convert the bit-sliced regs v0-v7 (from select_bits) into 8bpp pixel
// pairs: for each column pair, pseudo-shift the two bits into positions
// 9,8 and 1,0 (see ShiftMuxSp0/1), mask with v28, and OR the halves
// together. Outputs land in the four registers named by the arguments;
// v24 carries the attribute bits, which are promoted and OR'd in along
// with the 0x1010 sprite palette offset.
macro sprite_convert(_76,_54,_32,_10) { | |
// Column 7,6 | |
vmudl v13,v7,v29[e15] | |
vmadn v13,v7,v30[e15] | |
vand v13,v13,v28[e8] | |
vmudl v14,v6,v29[e14] | |
vand v14,v14,v28[e9] | |
vor {_76},v13,v14[e0] | |
// Column 5,4 | |
vmudl v13,v5,v29[e13] | |
vmadn v13,v5,v30[e13] | |
vand v13,v13,v28[e8] | |
vmudl v14,v4,v29[e12] | |
vand v14,v14,v28[e9] | |
vor {_54},v13,v14[e0] | |
// Column 3,2 | |
vmudl v13,v3,v29[e11] | |
vmadn v13,v3,v30[e11] | |
vand v13,v13,v28[e8] | |
vmudl v14,v2,v29[e10] | |
vand v14,v14,v28[e9] | |
vor {_32},v13,v14[e0] | |
// Column 1,0 | |
vmudl v13,v1,v29[e9] | |
vmadn v13,v1,v30[e9] | |
vand v13,v13,v28[e8] | |
vmudl v14,v0,v29[e8] | |
vmadn v14,v0,v30[e8] | |
vand v14,v14,v28[e9] | |
vor {_10},v13,v14[e0] | |
// Add attributes | |
// Promote the 2 attribute bits into the palette-select position
// (ShiftMux0 element marked "promote attributes (8bpp)"), then
// multiply by 0x0101 (BitsOfBytes[0]) to replicate them into both
// pixel bytes of each lane.
vmudl v24,v24,v8[e11] | |
vmudn v24,v24,v10[e8] | |
vor v24,v24,v10[e12] // 0x1010, sprite palette | |
vor {_76},{_76},v24[e0] | |
vor {_54},{_54},v24[e0] | |
vor {_32},{_32},v24[e0] | |
vor {_10},{_10},v24[e0] | |
} | |
// First line (with 2 lines we can do a full transpose) | |
// lpv sign-positions each attribute byte at the top of its lane.
lpv v24[e0], 0(a3) // attributes | |
select_bits(v11) | |
// Preload next line | |
lqv v11[e0], 16(a2) | |
sprite_convert(v16,v17,v18,v19) | |
// Second line | |
lpv v24[e0], 8(a3) // attributes | |
select_bits(v11) | |
// Preload next line | |
lqv v11[e0], 32(a2) | |
sprite_convert(v20,v21,v22,v23) | |
// So now we have | |
// v16: pixel 0,1 of 8 sprites on line 0 | |
// v17: pixel 2,3 | |
// v18: pixel 4,5 | |
// v19: pixel 6,7 | |
// v20: pixel 0,1 of 8 sprites on line 1 | |
// v21: pixel 2,3, line 1 | |
// v22: pixel 4,5, line 1 | |
// v23: pixel 6,7, line 1 | |
// (0.0,0.1),... | |
// (0.2,0.3),... | |
// (0.4,0.5),... | |
// (0.6,0.7),... | |
// (8.0,8.1),... | |
// (8.2,8.3),... | |
// (8.4,8.5),... | |
// (8.6,8.7),... | |
// The idea is to transpose this to | |
// (0.0,0.1),(0.2,0.3),(0.4,0.5),(0.6,0.7),(8.0,8.1),(8.2,8.3),(8.4,8.5),(8.6,8.7) | |
// ... | |
// v16: line 0 sprite 0, line 1 sprite 0 | |
// v17: line 0 sprite 1, line 1 sprite 1 | |
// v18: line 0 sprite 2, line 1 sprite 2 | |
// v19: line 0 sprite 3, line 1 sprite 3 | |
// v20: line 0 sprite 4, line 1 sprite 4 | |
// v21: line 0 sprite 5, line 1 sprite 5 | |
// v22: line 0 sprite 6, line 1 sprite 6 | |
// v23: line 0 sprite 7, line 1 sprite 7 | |
// Transpose v16..v23 via DMEM scratch: stv stores a diagonal slice of
// the register group, ltv loads it back rotated; seven round trips
// complete the 8x8 transpose (element 0 stays in place).
lli t0, dmem_dst | |
// Note: I'm not 100% sure that this is the layout in DMEM. | |
// v17[0],v18[1],v19[2],v20[3],v21[4],v22[5],v23[6],v16[7] -> 0x70 | |
stv v16[e2], 0x70(t0) | |
stv v16[e4], 0x60(t0) | |
stv v16[e6], 0x50(t0) | |
stv v16[e8], 0x40(t0) | |
stv v16[e10], 0x30(t0) | |
stv v16[e12], 0x20(t0) | |
stv v16[e14], 0x10(t0) | |
// 0x70 -> v16[1],v17[2],v18[3],v19[4],v20[5],v21[6],v22[7],v23[0] | |
ltv v16[e14], 0x70(t0) | |
ltv v16[e12], 0x60(t0) | |
ltv v16[e10], 0x50(t0) | |
ltv v16[e8], 0x40(t0) | |
ltv v16[e6], 0x30(t0) | |
ltv v16[e4], 0x20(t0) | |
ltv v16[e2], 0x10(t0) | |
// Fill in each sprite | |
// Scatter the 8-byte sprite rows into the line buffers: each sprite's X
// byte (from src_sp_x) is added to the current line base and sdv stores
// 8 pixels there. Elements e0 = first line, e8 = second line of the
// transposed pair; each line buffer is 256 bytes (sp_s3 += 256 per line).
lbu t3, 7(sp_s2) | |
lbu t2, 6(sp_s2) | |
lbu t1, 5(sp_s2) | |
lbu t0, 4(sp_s2) | |
add t3, sp_s3 | |
sdv v23[e0], 0(t3) | |
add t2, sp_s3 | |
sdv v22[e0], 0(t2) | |
add t1, sp_s3 | |
sdv v21[e0], 0(t1) | |
add t0, sp_s3 | |
sdv v20[e0], 0(t0) | |
lbu t3, 3(sp_s2) | |
lbu t2, 2(sp_s2) | |
lbu t1, 1(sp_s2) | |
lbu t0, 0(sp_s2) | |
add t3, sp_s3 | |
sdv v19[e0], 0(t3) | |
add t2, sp_s3 | |
sdv v18[e0], 0(t2) | |
add t1, sp_s3 | |
sdv v17[e0], 0(t1) | |
add t0, sp_s3 | |
sdv v16[e0], 0(t0) | |
// Advance to the next line's buffer and its 8 X values.
addi sp_s3, 256 | |
addi sp_s2, 8 | |
lbu t3, 7(sp_s2) | |
lbu t2, 6(sp_s2) | |
lbu t1, 5(sp_s2) | |
lbu t0, 4(sp_s2) | |
add t3, sp_s3 | |
sdv v23[e8], 0(t3) | |
add t2, sp_s3 | |
sdv v22[e8], 0(t2) | |
add t1, sp_s3 | |
sdv v21[e8], 0(t1) | |
add t0, sp_s3 | |
sdv v20[e8], 0(t0) | |
lbu t3, 3(sp_s2) | |
lbu t2, 2(sp_s2) | |
lbu t1, 1(sp_s2) | |
lbu t0, 0(sp_s2) | |
add t3, sp_s3 | |
sdv v19[e8], 0(t3) | |
add t2, sp_s3 | |
sdv v18[e8], 0(t2) | |
add t1, sp_s3 | |
sdv v17[e8], 0(t1) | |
add t0, sp_s3 | |
sdv v16[e8], 0(t0) | |
addi sp_s3, 256 | |
addi sp_s2, 8 | |
// 2 lines consumed: advance pattern/attribute pointers and loop until
// sp_s1 (line counter, starts at 8) runs out.
addi a2, 16*2 | |
addi sp_s1, -2 | |
bnez sp_s1,sprite_loop | |
addi a3, 8*2 | |
// DMA out sprites | |
lli t0, dmem_dst+0x80 | |
mtc0 t0, C0_MEM_ADDR | |
mtc0 a1, C0_DRAM_ADDR | |
// Length register takes (byte count - 1).
lli t0, conv_dst_size-dst_sp-1 | |
mtc0 t0, C0_WR_LEN | |
- | |
mfc0 t0, C0_DMA_BUSY | |
bnez t0,- | |
nop | |
// Advance the RDRAM pointers past this batch and signal the CPU.
addi a1, conv_dst_size-dst_sp | |
addi a0, conv_src_size | |
break | |
} | |
pullvar base | |
// Back to CPU code generation for the rest of the file.
arch n64.cpu | |
align(4) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment