Last active
June 14, 2024 19:58
-
-
Save pietrobraione/895fe84dc61c8dfb6957f4be34dea3b2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* This gist shows how to build an almost zero-assembly template jit compiler from a | |
* threaded interpreter. | |
* See http://eli.thegreenplace.net/2012/07/12/computed-goto-for-efficient-dispatch-tables | |
* for the original threaded code interpreter. To illustrate compilation of jumps I added | |
* a couple of jump bytecodes, and to illustrate compilation of primitives (with external | |
* function invocations) I added a print bytecode. | |
* Profiling is extremely rough: When a backjump is executed 1000 times the code between | |
* the jump bytecode and the target of the jump is jitted, and nothing else. | |
* Works with clang and gcc under macOS Sonoma (Apple silicon) and Ubuntu 22.04 (x86-64), | |
* tried on the following versions: | |
* | |
* $ clang --version | |
* Apple clang version 15.0.0 (clang-1500.3.9.4) | |
* Target: arm64-apple-darwin23.5.0 | |
* | |
* $ gcc -v | |
* Target: aarch64-apple-darwin23 | |
* gcc version 14.1.0 (Homebrew GCC 14.1.0_1) | |
* | |
* $ clang --version | |
* Ubuntu clang version 14.0.0-1ubuntu1.1 | |
* Target: x86_64-pc-linux-gnu | |
* | |
* $ gcc -v | |
* Target: x86_64-linux-gnu | |
* gcc version 11.4.0 (Ubuntu 11.4.0-1ubuntu1~22.04) | |
* | |
* It works even when compiled with optimizations, except with gcc -O1. | |
*/ | |
#define DEBUG | |
#define _DEFAULT_SOURCE | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <time.h> | |
#include <sys/mman.h> | |
/* constants for the bytecodes */ | |
#define OP_INC 0x0 | |
#define OP_DEC 0x1 | |
#define OP_MUL2 0x2 | |
#define OP_DIV2 0x3 | |
#define OP_PRN 0x4 | |
#define OP_JMP 0x5 | |
#define OP_BRZ 0x6 | |
#define OP_HALT 0x7 | |
#define RET 0x8 /* not a bytecode! */ | |
/* I2I_DISPATCH: from interpreted to interpreted | |
* A2C_DISPATCH: from interpreted/compiled to compiled | |
* C2I_DISPATCH: from compiled to interpreted | |
* I2A_DISPATCH: from interpreted to interpreted/compiled | |
* HALT: from interpreted/compiled to interpreted (only return block) | |
*/ | |
#define I2I_DISPATCH goto *dispatch_table[code[pc]] | |
#if defined(__aarch64__) | |
#define A2C_DISPATCH __asm__ __volatile__("mov x11, %0 \nbr x11\n" : : "r" (_tgt_table[pc]), "r" (pc), "r" (_tgt_table) : "x11") | |
#define C2I_DISPATCH __asm__ __volatile__("mov x11, %0 \nbr x11\n" : : "r" (_dispatch_table[code[pc]]), "r" (pc), "r" (code), "r" (_dispatch_table) : "x11") | |
#define HALT __asm__ __volatile__("mov x11, %0 \nbr x11\n" : : "r" (_dispatch_table[RET]), "r" (_dispatch_table) : "x11") | |
#elif defined(__i386__) || defined(__x86_64__) | |
#define A2C_DISPATCH __asm__ __volatile__("jmp *%0" : : "r" (_tgt_table[pc]), "r" (pc), "r" (_tgt_table)) | |
#define C2I_DISPATCH __asm__ __volatile__("jmp *%0" : : "r" (_dispatch_table[code[pc]]), "r" (pc), "r" (code), "r" (_dispatch_table)) | |
#define HALT __asm__ __volatile__("jmp *%0" : : "r" (_dispatch_table[RET]), "r" (_dispatch_table)) | |
#else | |
#error "Architecture not supported" | |
#endif | |
#define I2A_DISPATCH \ | |
if (_tgt_table != 0 && _tgt_table[pc] != 0) { \ | |
A2C_DISPATCH; \ | |
} else { \ | |
I2I_DISPATCH; \ | |
} | |
/* pointers to binary code templates | |
* (bytecodes and trampolines) | |
*/ | |
void* *entry_bytecode; | |
void* *exit_bytecode; | |
void* entry_jump_trampoline; | |
void* exit_jump_trampoline; | |
void* entry_breakout_trampoline; | |
void* exit_breakout_trampoline; | |
/* how big can be a code template? */ | |
unsigned long max_template_size; | |
/* outputs of the compiler */ | |
/* the root of the compiled code; this pointer is not used right now, | |
* but it could be used to unmap the allocated memory | |
*/ | |
char *binary = 0; | |
/* the size of the memory area where the | |
* compiled code resides | |
*/ | |
size_t binary_size = 0; | |
/* the size of the compiled code; it can | |
* be less than binary_size | |
*/ | |
size_t compiled_code_size = 0; | |
/* associates each bytecode in the source code with the address of its | |
* translation in the binary; necessary for jumps | |
*/ | |
void* *tgt_table; | |
/* forward declarations */ | |
static void calc_max_template_size(void); | |
static void compile(char*, unsigned int, unsigned int, unsigned int); | |
/* some useful private macros */ | |
#define COPY_TEMPLATE(which) \ | |
do { \ | |
for (char *from = (char *) entry_ ## which; \ | |
from != (char *) exit_ ## which; ++from, ++to) { \ | |
*to = *from; \ | |
++_compiled_code_size; \ | |
} \ | |
} while (0) | |
#define DUMP_BINARY \ | |
do { \ | |
printf("\n"); \ | |
int i; \ | |
for (i = 0; i < compiled_code_size; ++i) { \ | |
printf("%02hhX ", binary[i]); \ | |
if (i % 16 == 15) { \ | |
printf("\n"); \ | |
} \ | |
} \ | |
if (i % 16 != 0) { \ | |
printf("\n"); \ | |
} \ | |
printf("\n"); \ | |
} while (0) | |
#define DUMP_SOURCE \ | |
do { \ | |
for (int i = 0; i < code_size; ++i) { \ | |
char* dis[] = { "inc", "dec", "mul2", "div2", "prn", "jmp", "brz", "halt" }; \ | |
printf("%d %s", i, dis[code[i]]); \ | |
if (code[i] == OP_JMP || code[i] == OP_BRZ) { \ | |
printf(" %d", code[++i]); \ | |
} \ | |
printf("\n");\ | |
} \ | |
printf("\n"); \ | |
} while (0) | |
/* executes code (interpreter + jit) | |
* | |
* code: pointer to a bufffer containing the array of bytecodes | |
* to be executed | |
* code_size: the size of the code parameter | |
* initval: the initial value of the accumulator register of the virtual machine | |
* do_jit: 1 if we want to activate the jit compiler, 0 if we want a pure | |
* interpretation of the code | |
*/ | |
int exec(char* code, unsigned int code_size, int initval, int do_jit) { | |
/* The indices of labels in the dispatch_table are the relevant opcodes */ | |
static void* dispatch_table[] = { | |
&&do_inc, &&do_dec, &&do_mul2, &&do_div2, &&do_prn, &&do_jmp, &&do_brz, &&do_halt, &&do_ret}; | |
static void* _exit_bytecode[] = { | |
&&do_inc_end, &&do_dec_end, &&do_mul2_end, &&do_div2_end, &&do_prn_end, &&do_jmp_end, &&do_brz_end, &&do_halt_end}; | |
/* prepares the global environment for compiler (just once) */ | |
static int do_global_init = 1; | |
if (do_global_init) { | |
entry_bytecode = dispatch_table; | |
exit_bytecode = _exit_bytecode; | |
entry_jump_trampoline = &&jump_trampoline; | |
exit_jump_trampoline = &&jump_trampoline_end; | |
entry_breakout_trampoline = &&breakout_trampoline; | |
exit_breakout_trampoline = &&breakout_trampoline_end; | |
calc_max_template_size(); | |
do_global_init = 0; | |
} | |
/* these pointers are necessary to force the use | |
* of absolute addressing in jitted code | |
*/ | |
void* *_dispatch_table = dispatch_table; | |
void* *_tgt_table = tgt_table; | |
const char *_print_template = "*** %d ***\n"; | |
int (*_printf)(const char *, ...) = printf; | |
/* this is for profiling */ | |
unsigned int profile_counters[code_size]; | |
for (int i = 0; i < code_size; ++i) { | |
profile_counters[i] = 0; | |
} | |
/* inits the intepreter */ | |
unsigned int pc = 0; /* the program counter */ | |
unsigned int old_pc; /* used for jitting */ | |
int val = initval; /* the (only) register of the virtual machine */ | |
/* starts */ | |
I2A_DISPATCH; | |
/* some code templates for the compiler, | |
* not used by the interpreter | |
*/ | |
jump_trampoline: | |
if (_tgt_table[pc] != 0) { | |
A2C_DISPATCH; | |
} else { | |
C2I_DISPATCH; | |
} | |
jump_trampoline_end: | |
breakout_trampoline: | |
C2I_DISPATCH; | |
breakout_trampoline_end: | |
/* here come all the bytecode operations */ | |
do_inc: | |
++val; | |
++pc; | |
do_inc_end: | |
I2A_DISPATCH; | |
do_dec: | |
--val; | |
++pc; | |
do_dec_end: | |
I2A_DISPATCH; | |
do_mul2: | |
val *= 2; | |
++pc; | |
do_mul2_end: | |
I2A_DISPATCH; | |
do_div2: | |
val /= 2; | |
++pc; | |
do_div2_end: | |
I2A_DISPATCH; | |
do_prn: | |
_printf(_print_template, val); | |
++pc; | |
do_prn_end: | |
I2A_DISPATCH; | |
do_jmp: | |
/* profiling... */ | |
if (code[pc + 1] < 0) { | |
++profile_counters[pc]; | |
} | |
old_pc = pc; | |
/* ...and actual bytecode semantics */ | |
pc += code[pc + 1]; | |
do_jmp_end: | |
if (do_jit && profile_counters[old_pc] > 1000) { | |
compile(code, code_size, pc, old_pc); | |
do_jit = 0; /* no more jit after the first one! */ | |
} | |
I2A_DISPATCH; | |
do_brz: | |
if (val == 0) { | |
pc += code[pc + 1]; | |
} else { | |
pc += 2; | |
} | |
do_brz_end: | |
I2A_DISPATCH; | |
do_halt: | |
HALT; | |
do_halt_end: | |
do_ret: | |
return val; /* this MUST be the last statement, or the block will not be contiguous! */ | |
} | |
/* calculates the maximum size of the binary code blocks | |
* that implement the semantics of the bytecodes | |
*/ | |
static void calc_max_template_size(void) { | |
max_template_size = 0; | |
for (int i = OP_INC; i < RET; ++i) { | |
long binary_block_size = exit_bytecode[i] - entry_bytecode[i]; | |
if (binary_block_size > max_template_size) { | |
max_template_size = binary_block_size; | |
} | |
} | |
/* (to be safe we should also consider the size of | |
* the trampolines) | |
*/ | |
} | |
/* compiles to binary */ | |
void compile(char *code, unsigned int code_size, unsigned int start, unsigned int end) { | |
#if defined DEBUG | |
clock_t start_t, end_t; | |
printf("Compiling...\n"); | |
start_t = clock(); | |
#endif | |
/* does some basic sanity checks */ | |
if (end > code_size || start >= end) { | |
printf("Compile - Invalid parameter\n"); | |
return; | |
} | |
/* allocates a big enough buffer */ | |
size_t code_buffer_size = (end - start) * sizeof(char) * max_template_size; | |
char *code_buffer = (char *) mmap(0, code_buffer_size, | |
PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); | |
if (code_buffer == MAP_FAILED) { | |
printf("Compile - Failed allocation of code buffer\n"); | |
return; | |
} | |
/* allocates and initializes the target table */ | |
void* *_tgt_table = malloc(code_size * sizeof(void *)); | |
if (_tgt_table == 0) { | |
printf("Compile - Failed allocation of target table\n"); | |
munmap((void *) code_buffer, code_buffer_size); | |
return; | |
} | |
for (unsigned long long i = 0; i < code_size; ++i) { | |
_tgt_table[i] = 0; | |
} | |
/* dispatches on source code and copies the | |
* corresponding blocks in a buffer; at the | |
* same times builds the target table | |
*/ | |
char *to = code_buffer; | |
size_t _compiled_code_size = 0; | |
for (unsigned long long pc = start; pc < end; ++pc) { | |
char cur_bytecode = code[pc]; | |
/* updates _tgt_table */ | |
_tgt_table[pc] = to; | |
/* appends in code_buffer the translation of the bytecode */ | |
COPY_TEMPLATE(bytecode[cur_bytecode]); | |
/* if the bytecode is jump/brz also appends the trampoline, | |
* and increments pc by one, to keep into account for the operand | |
*/ | |
if (cur_bytecode == OP_JMP || cur_bytecode == OP_BRZ) { | |
COPY_TEMPLATE(jump_trampoline); | |
++pc; | |
} | |
/* if this is the last bytecode and it is not a halt or | |
* a jump, appends the breakout trampoline | |
*/ | |
if (pc == end - 1 && | |
cur_bytecode != OP_HALT && | |
cur_bytecode != OP_JMP && | |
cur_bytecode != OP_BRZ) { | |
COPY_TEMPLATE(breakout_trampoline); | |
} | |
} | |
/* makes the buffer executable */ | |
mprotect(code_buffer, _compiled_code_size, PROT_READ | PROT_EXEC); | |
/* deletes the old compiled code | |
* and target table and replaces | |
* them with the new ones | |
*/ | |
if (binary != 0) { | |
munmap((void *) binary, binary_size); | |
} | |
binary = code_buffer; | |
binary_size = code_buffer_size; | |
compiled_code_size = _compiled_code_size; | |
if (tgt_table != 0) { | |
free(tgt_table); | |
} | |
tgt_table = _tgt_table; | |
#if defined DEBUG | |
end_t = clock(); | |
printf("Compilation time: %f sec\n", ((double) (end_t - start_t)) / CLOCKS_PER_SEC); | |
printf("Binary:\n"); | |
DUMP_BINARY; | |
#endif | |
} | |
int main(void) { | |
unsigned int code_size; | |
char *code; | |
unsigned int code_start_jit, code_end_jit; | |
int initval; | |
#if 0 | |
/* Example 1: random straight-line code (no jumps to avoid | |
* divergence) | |
*/ | |
code_size = 10000; | |
code = malloc(code_size * sizeof(char)); | |
srand(2); | |
for (unsigned long long i = 0; i < code_size - 1; ++i) { | |
code[i] = (rand() % 5); /* all bytecodes except jumps and halt */ | |
} | |
code[code_size - 1] = OP_HALT; | |
code_start_jit = 0; code_end_jit = code_size; | |
initval = 0; | |
#endif | |
#if 0 | |
/* Example 2: a simple brz instruction */ | |
code_size = 4; | |
code = malloc(code_size * sizeof(char)); | |
code[0] = OP_BRZ; | |
code[1] = 3; | |
code[2] = OP_INC; | |
code[3] = OP_HALT; | |
code_start_jit = 0; code_end_jit = code_size; | |
initval = 0; | |
#endif | |
/* Example 3: a long loop, jits the | |
* loop body | |
*/ | |
code_size = 19; | |
code = malloc(code_size * sizeof(char)); | |
code[0] = OP_BRZ; | |
code[1] = 18; /* goes to halt */ | |
code[2] = OP_DEC; | |
code[3] = OP_DEC; | |
code[4] = OP_DEC; | |
code[5] = OP_DEC; | |
code[6] = OP_DEC; | |
code[7] = OP_DEC; | |
code[8] = OP_DEC; | |
code[9] = OP_DEC; | |
code[10] = OP_DEC; | |
code[11] = OP_DEC; | |
code[12] = OP_INC; | |
code[13] = OP_DEC; | |
code[14] = OP_INC; | |
code[15] = OP_DEC; /* subtracts 10 to the register at each iteration*/ | |
code[16] = OP_JMP; | |
code[17] = -16; /* goes to brz */ | |
code[18] = OP_HALT; | |
code_start_jit = 2; code_end_jit = 15; | |
initval = 1000000; | |
#if defined DEBUG | |
printf("Bytecode:\n"); | |
DUMP_SOURCE; | |
#endif | |
clock_t start_t, end_t; | |
int result; | |
/* first runs the code with the threaded interpreter... */ | |
start_t = clock(); | |
result = exec(code, code_size, initval, 0); | |
end_t = clock(); | |
printf("Interpreted execution, result: %d, time: %f sec\n", result, ((double) (end_t - start_t)) / CLOCKS_PER_SEC); | |
/* ..then reruns with jit enabled */ | |
start_t = clock(); | |
result = exec(code, code_size, initval, 1); | |
end_t = clock(); | |
printf("Jitted execution, result: %d, time: %f sec\n", result, ((double) (end_t - start_t)) / CLOCKS_PER_SEC); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment