Skip to content

Instantly share code, notes, and snippets.

@pietrobraione
Last active June 14, 2024 19:58
Show Gist options
  • Save pietrobraione/895fe84dc61c8dfb6957f4be34dea3b2 to your computer and use it in GitHub Desktop.
Save pietrobraione/895fe84dc61c8dfb6957f4be34dea3b2 to your computer and use it in GitHub Desktop.
/* This gist shows how to build an almost zero-assembly template jit compiler from a
* threaded interpreter.
* See http://eli.thegreenplace.net/2012/07/12/computed-goto-for-efficient-dispatch-tables
* for the original threaded code interpreter. To illustrate compilation of jumps I added
* a couple of jump bytecodes, and to illustrate compilation of primitives (with external
* function invocations) I added a print bytecode.
* Profiling is extremely rough: When a backjump is executed 1000 times the code between
* the jump bytecode and the target of the jump is jitted, and nothing else.
* Works with clang and gcc under macOS Sonoma (Apple silicon) and Ubuntu 22.04 (x86-64),
* tried on the following versions:
*
* $ clang --version
* Apple clang version 15.0.0 (clang-1500.3.9.4)
* Target: arm64-apple-darwin23.5.0
*
* $ gcc -v
* Target: aarch64-apple-darwin23
* gcc version 14.1.0 (Homebrew GCC 14.1.0_1)
*
* $ clang --version
* Ubuntu clang version 14.0.0-1ubuntu1.1
* Target: x86_64-pc-linux-gnu
*
* $ gcc -v
* Target: x86_64-linux-gnu
* gcc version 11.4.0 (Ubuntu 11.4.0-1ubuntu1~22.04)
*
* It works even when compiled with optimizations, except with gcc -O1.
*/
#define DEBUG
#define _DEFAULT_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <sys/mman.h>
/* constants for the bytecodes */
#define OP_INC 0x0
#define OP_DEC 0x1
#define OP_MUL2 0x2
#define OP_DIV2 0x3
#define OP_PRN 0x4
#define OP_JMP 0x5
#define OP_BRZ 0x6
#define OP_HALT 0x7
#define RET 0x8 /* not a bytecode! */
/* I2I_DISPATCH: from interpreted to interpreted
* A2C_DISPATCH: from interpreted/compiled to compiled
* C2I_DISPATCH: from compiled to interpreted
* I2A_DISPATCH: from interpreted to interpreted/compiled
* HALT: from interpreted/compiled to interpreted (only return block)
*/
#define I2I_DISPATCH goto *dispatch_table[code[pc]]
#if defined(__aarch64__)
#define A2C_DISPATCH __asm__ __volatile__("mov x11, %0 \nbr x11\n" : : "r" (_tgt_table[pc]), "r" (pc), "r" (_tgt_table) : "x11")
#define C2I_DISPATCH __asm__ __volatile__("mov x11, %0 \nbr x11\n" : : "r" (_dispatch_table[code[pc]]), "r" (pc), "r" (code), "r" (_dispatch_table) : "x11")
#define HALT __asm__ __volatile__("mov x11, %0 \nbr x11\n" : : "r" (_dispatch_table[RET]), "r" (_dispatch_table) : "x11")
#elif defined(__i386__) || defined(__x86_64__)
#define A2C_DISPATCH __asm__ __volatile__("jmp *%0" : : "r" (_tgt_table[pc]), "r" (pc), "r" (_tgt_table))
#define C2I_DISPATCH __asm__ __volatile__("jmp *%0" : : "r" (_dispatch_table[code[pc]]), "r" (pc), "r" (code), "r" (_dispatch_table))
#define HALT __asm__ __volatile__("jmp *%0" : : "r" (_dispatch_table[RET]), "r" (_dispatch_table))
#else
#error "Architecture not supported"
#endif
#define I2A_DISPATCH \
if (_tgt_table != 0 && _tgt_table[pc] != 0) { \
A2C_DISPATCH; \
} else { \
I2I_DISPATCH; \
}
/* pointers to binary code templates
* (bytecodes and trampolines)
*/
void* *entry_bytecode;
void* *exit_bytecode;
void* entry_jump_trampoline;
void* exit_jump_trampoline;
void* entry_breakout_trampoline;
void* exit_breakout_trampoline;
/* how big can be a code template? */
unsigned long max_template_size;
/* outputs of the compiler */
/* the root of the compiled code; this pointer is not used right now,
* but it could be used to unmap the allocated memory
*/
char *binary = 0;
/* the size of the memory area where the
* compiled code resides
*/
size_t binary_size = 0;
/* the size of the compiled code; it can
* be less than binary_size
*/
size_t compiled_code_size = 0;
/* associates each bytecode in the source code with the address of its
* translation in the binary; necessary for jumps
*/
void* *tgt_table;
/* forward declarations */
static void calc_max_template_size(void);
static void compile(char*, unsigned int, unsigned int, unsigned int);
/* some useful private macros */
#define COPY_TEMPLATE(which) \
do { \
for (char *from = (char *) entry_ ## which; \
from != (char *) exit_ ## which; ++from, ++to) { \
*to = *from; \
++_compiled_code_size; \
} \
} while (0)
#define DUMP_BINARY \
do { \
printf("\n"); \
int i; \
for (i = 0; i < compiled_code_size; ++i) { \
printf("%02hhX ", binary[i]); \
if (i % 16 == 15) { \
printf("\n"); \
} \
} \
if (i % 16 != 0) { \
printf("\n"); \
} \
printf("\n"); \
} while (0)
#define DUMP_SOURCE \
do { \
for (int i = 0; i < code_size; ++i) { \
char* dis[] = { "inc", "dec", "mul2", "div2", "prn", "jmp", "brz", "halt" }; \
printf("%d %s", i, dis[code[i]]); \
if (code[i] == OP_JMP || code[i] == OP_BRZ) { \
printf(" %d", code[++i]); \
} \
printf("\n");\
} \
printf("\n"); \
} while (0)
/* executes code (interpreter + jit)
*
* code: pointer to a bufffer containing the array of bytecodes
* to be executed
* code_size: the size of the code parameter
* initval: the initial value of the accumulator register of the virtual machine
* do_jit: 1 if we want to activate the jit compiler, 0 if we want a pure
* interpretation of the code
*/
int exec(char* code, unsigned int code_size, int initval, int do_jit) {
/* The indices of labels in the dispatch_table are the relevant opcodes */
static void* dispatch_table[] = {
&&do_inc, &&do_dec, &&do_mul2, &&do_div2, &&do_prn, &&do_jmp, &&do_brz, &&do_halt, &&do_ret};
static void* _exit_bytecode[] = {
&&do_inc_end, &&do_dec_end, &&do_mul2_end, &&do_div2_end, &&do_prn_end, &&do_jmp_end, &&do_brz_end, &&do_halt_end};
/* prepares the global environment for compiler (just once) */
static int do_global_init = 1;
if (do_global_init) {
entry_bytecode = dispatch_table;
exit_bytecode = _exit_bytecode;
entry_jump_trampoline = &&jump_trampoline;
exit_jump_trampoline = &&jump_trampoline_end;
entry_breakout_trampoline = &&breakout_trampoline;
exit_breakout_trampoline = &&breakout_trampoline_end;
calc_max_template_size();
do_global_init = 0;
}
/* these pointers are necessary to force the use
* of absolute addressing in jitted code
*/
void* *_dispatch_table = dispatch_table;
void* *_tgt_table = tgt_table;
const char *_print_template = "*** %d ***\n";
int (*_printf)(const char *, ...) = printf;
/* this is for profiling */
unsigned int profile_counters[code_size];
for (int i = 0; i < code_size; ++i) {
profile_counters[i] = 0;
}
/* inits the intepreter */
unsigned int pc = 0; /* the program counter */
unsigned int old_pc; /* used for jitting */
int val = initval; /* the (only) register of the virtual machine */
/* starts */
I2A_DISPATCH;
/* some code templates for the compiler,
* not used by the interpreter
*/
jump_trampoline:
if (_tgt_table[pc] != 0) {
A2C_DISPATCH;
} else {
C2I_DISPATCH;
}
jump_trampoline_end:
breakout_trampoline:
C2I_DISPATCH;
breakout_trampoline_end:
/* here come all the bytecode operations */
do_inc:
++val;
++pc;
do_inc_end:
I2A_DISPATCH;
do_dec:
--val;
++pc;
do_dec_end:
I2A_DISPATCH;
do_mul2:
val *= 2;
++pc;
do_mul2_end:
I2A_DISPATCH;
do_div2:
val /= 2;
++pc;
do_div2_end:
I2A_DISPATCH;
do_prn:
_printf(_print_template, val);
++pc;
do_prn_end:
I2A_DISPATCH;
do_jmp:
/* profiling... */
if (code[pc + 1] < 0) {
++profile_counters[pc];
}
old_pc = pc;
/* ...and actual bytecode semantics */
pc += code[pc + 1];
do_jmp_end:
if (do_jit && profile_counters[old_pc] > 1000) {
compile(code, code_size, pc, old_pc);
do_jit = 0; /* no more jit after the first one! */
}
I2A_DISPATCH;
do_brz:
if (val == 0) {
pc += code[pc + 1];
} else {
pc += 2;
}
do_brz_end:
I2A_DISPATCH;
do_halt:
HALT;
do_halt_end:
do_ret:
return val; /* this MUST be the last statement, or the block will not be contiguous! */
}
/* calculates the maximum size of the binary code blocks
* that implement the semantics of the bytecodes
*/
static void calc_max_template_size(void) {
max_template_size = 0;
for (int i = OP_INC; i < RET; ++i) {
long binary_block_size = exit_bytecode[i] - entry_bytecode[i];
if (binary_block_size > max_template_size) {
max_template_size = binary_block_size;
}
}
/* (to be safe we should also consider the size of
* the trampolines)
*/
}
/* compiles to binary */
void compile(char *code, unsigned int code_size, unsigned int start, unsigned int end) {
#if defined DEBUG
clock_t start_t, end_t;
printf("Compiling...\n");
start_t = clock();
#endif
/* does some basic sanity checks */
if (end > code_size || start >= end) {
printf("Compile - Invalid parameter\n");
return;
}
/* allocates a big enough buffer */
size_t code_buffer_size = (end - start) * sizeof(char) * max_template_size;
char *code_buffer = (char *) mmap(0, code_buffer_size,
PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (code_buffer == MAP_FAILED) {
printf("Compile - Failed allocation of code buffer\n");
return;
}
/* allocates and initializes the target table */
void* *_tgt_table = malloc(code_size * sizeof(void *));
if (_tgt_table == 0) {
printf("Compile - Failed allocation of target table\n");
munmap((void *) code_buffer, code_buffer_size);
return;
}
for (unsigned long long i = 0; i < code_size; ++i) {
_tgt_table[i] = 0;
}
/* dispatches on source code and copies the
* corresponding blocks in a buffer; at the
* same times builds the target table
*/
char *to = code_buffer;
size_t _compiled_code_size = 0;
for (unsigned long long pc = start; pc < end; ++pc) {
char cur_bytecode = code[pc];
/* updates _tgt_table */
_tgt_table[pc] = to;
/* appends in code_buffer the translation of the bytecode */
COPY_TEMPLATE(bytecode[cur_bytecode]);
/* if the bytecode is jump/brz also appends the trampoline,
* and increments pc by one, to keep into account for the operand
*/
if (cur_bytecode == OP_JMP || cur_bytecode == OP_BRZ) {
COPY_TEMPLATE(jump_trampoline);
++pc;
}
/* if this is the last bytecode and it is not a halt or
* a jump, appends the breakout trampoline
*/
if (pc == end - 1 &&
cur_bytecode != OP_HALT &&
cur_bytecode != OP_JMP &&
cur_bytecode != OP_BRZ) {
COPY_TEMPLATE(breakout_trampoline);
}
}
/* makes the buffer executable */
mprotect(code_buffer, _compiled_code_size, PROT_READ | PROT_EXEC);
/* deletes the old compiled code
* and target table and replaces
* them with the new ones
*/
if (binary != 0) {
munmap((void *) binary, binary_size);
}
binary = code_buffer;
binary_size = code_buffer_size;
compiled_code_size = _compiled_code_size;
if (tgt_table != 0) {
free(tgt_table);
}
tgt_table = _tgt_table;
#if defined DEBUG
end_t = clock();
printf("Compilation time: %f sec\n", ((double) (end_t - start_t)) / CLOCKS_PER_SEC);
printf("Binary:\n");
DUMP_BINARY;
#endif
}
int main(void) {
unsigned int code_size;
char *code;
unsigned int code_start_jit, code_end_jit;
int initval;
#if 0
/* Example 1: random straight-line code (no jumps to avoid
* divergence)
*/
code_size = 10000;
code = malloc(code_size * sizeof(char));
srand(2);
for (unsigned long long i = 0; i < code_size - 1; ++i) {
code[i] = (rand() % 5); /* all bytecodes except jumps and halt */
}
code[code_size - 1] = OP_HALT;
code_start_jit = 0; code_end_jit = code_size;
initval = 0;
#endif
#if 0
/* Example 2: a simple brz instruction */
code_size = 4;
code = malloc(code_size * sizeof(char));
code[0] = OP_BRZ;
code[1] = 3;
code[2] = OP_INC;
code[3] = OP_HALT;
code_start_jit = 0; code_end_jit = code_size;
initval = 0;
#endif
/* Example 3: a long loop, jits the
* loop body
*/
code_size = 19;
code = malloc(code_size * sizeof(char));
code[0] = OP_BRZ;
code[1] = 18; /* goes to halt */
code[2] = OP_DEC;
code[3] = OP_DEC;
code[4] = OP_DEC;
code[5] = OP_DEC;
code[6] = OP_DEC;
code[7] = OP_DEC;
code[8] = OP_DEC;
code[9] = OP_DEC;
code[10] = OP_DEC;
code[11] = OP_DEC;
code[12] = OP_INC;
code[13] = OP_DEC;
code[14] = OP_INC;
code[15] = OP_DEC; /* subtracts 10 to the register at each iteration*/
code[16] = OP_JMP;
code[17] = -16; /* goes to brz */
code[18] = OP_HALT;
code_start_jit = 2; code_end_jit = 15;
initval = 1000000;
#if defined DEBUG
printf("Bytecode:\n");
DUMP_SOURCE;
#endif
clock_t start_t, end_t;
int result;
/* first runs the code with the threaded interpreter... */
start_t = clock();
result = exec(code, code_size, initval, 0);
end_t = clock();
printf("Interpreted execution, result: %d, time: %f sec\n", result, ((double) (end_t - start_t)) / CLOCKS_PER_SEC);
/* ..then reruns with jit enabled */
start_t = clock();
result = exec(code, code_size, initval, 1);
end_t = clock();
printf("Jitted execution, result: %d, time: %f sec\n", result, ((double) (end_t - start_t)) / CLOCKS_PER_SEC);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment