Simple JIT compiler to create a fixed-size page translation cache. Generates simple linear search code without branches. On small tables it costs about 16 cycles including the overhead of looping and calling from Lua.
Cache table layout:
struct tlb_entry { int64_t page, offset; }
JIT compiler source:
void tlb_emit(Dst_DECL, int size, int pagebits)
{
int i;
// Preconditions
// Arg 0 (rdi): Table address
// Arg 1 (rsi): Address
// R9 = page
| mov r9, rsi
| shr r9, pagebits // r9 = address >> pagebits
// RAX = result (or 0)
| xor rax, rax
for (i = 0; i < size; i++) {
// if table[i].page == r9 then rax = table[i].offset end
| cmp r9, qword[rdi + (i * 16)]
| cmove rax, qword[rdi + (i * 16) + 8]
}
// If rax contains an offset then add the base address
// rax = (rax == 0) ? rax : rax + address
| xor rcx, rcx
| test rax, rax
| cmovz rsi, rcx // if rax == 0 then rsi = 0 end
| add rax, rsi // rax = address + offset
| ret
}
Generated code for an 8-entry cache for 2MB page size. (Lookup costs approx. 16 cycles when called from Lua.)
00000000 4989F1 mov r9,rsi
00000003 49C1E915 shr r9,byte 0x15
00000007 4831C0 xor rax,rax
0000000A 4C3B0F cmp r9,[rdi]
0000000D 480F444708 cmovz rax,[rdi+0x8]
00000012 4C3B4F10 cmp r9,[rdi+0x10]
00000016 480F444718 cmovz rax,[rdi+0x18]
0000001B 4C3B4F20 cmp r9,[rdi+0x20]
0000001F 480F444728 cmovz rax,[rdi+0x28]
00000024 4C3B4F30 cmp r9,[rdi+0x30]
00000028 480F444738 cmovz rax,[rdi+0x38]
0000002D 4C3B4F40 cmp r9,[rdi+0x40]
00000031 480F444748 cmovz rax,[rdi+0x48]
00000036 4C3B4F50 cmp r9,[rdi+0x50]
0000003A 480F444758 cmovz rax,[rdi+0x58]
0000003F 4C3B4F60 cmp r9,[rdi+0x60]
00000043 480F444768 cmovz rax,[rdi+0x68]
00000048 4C3B4F70 cmp r9,[rdi+0x70]
0000004C 480F444778 cmovz rax,[rdi+0x78]
00000051 4831C9 xor rcx,rcx
00000054 4885C0 test rax,rax
00000057 480F44F1 cmovz rsi,rcx
0000005B 4801F0 add rax,rsi
0000005E C3 ret
cmovcc will always (modulo prediction… which does happen on a few uarch) load that address. I'm not sure that's a win, compared to only storing an address in RAX and loading if necessary.
That could be
cmovz rsi, rax
. It's also probably predictable, so a jcc may be quicker.