gist:3ef8747868ac126c30f94888425eb56d

yjit についてメモ書き

MRI に組み込まれた JIT 実装
Shopify で Maxime Chevalier-Boisvert さんが中心となり開発された
日本語だと https://techracho.bpsinc.jp/hachi8833/2021_11_02/112909 が詳しい

めちゃ速い

$ for opt in '' '--jit' '--yjit'; do echo "opt": $opt; time ./miniruby $opt -e 'def fib(n); return n if n < 2; fib(n - 1) + fib(n-2); end; fib(35)'; done
opt:

real    0m1.050s
user    0m0.971s
sys     0m0.036s
opt: --jit

real    0m0.755s
user    0m1.121s
sys     0m0.072s
opt: --yjit

real    0m0.354s
user    0m0.263s
sys     0m0.089s

--yjit で何が起きるか

ruby.c の proc_options() 関数でオプションが解釈される。 opt->features にビットがセットされる。

    for (argc--, argv++; argc > 0; argc--, argv++) {
    const char *const arg = argv[0];
    if (!arg || arg[0] != '-' || !arg[1])
        break;

    s = arg + 1;
（略）
      case '-':
        if (!s[1] || (s[1] == '\r' && !s[2])) {
        argc--, argv++;
        goto switch_end;
        }
        s++;
（略）
            if (strcmp("copyright", s) == 0) {
（略）
            }
            else if (strcmp("yjit", s) == 0 || setup_yjit_options(s, &opt->yjit)) {
#if USE_MJIT
                FEATURE_SET(opt->features, FEATURE_BIT(yjit));
#else
                rb_warn("Ruby was built without JIT support");
#endif
            }

USE_MJIT は yjit でも共有しているっぽい。そのうち別々になる？

その後 process_options() で初期化処理が実行される。

    if (opt->features.set & FEATURE_BIT(yjit)) {
#if USE_MJIT
        if (opt->mjit.on) {
            rb_warn("MJIT and YJIT cannot both be enabled at the same time. Exiting");
            exit(1);
        }
#endif
        rb_yjit_init(&opt->yjit);
    }

初期化処理は yjit_iface.c に書かれている。

struct rb_yjit_options rb_yjit_opts;
（略）
void
rb_yjit_init(struct rb_yjit_options *options)
{
    if (!YJIT_SUPPORTED_P || !JIT_ENABLED) {
        return;
    }

    rb_yjit_opts = *options;
    rb_yjit_opts.yjit_enabled = true;
（略）
}

いったんは yjit_enabled が true になることだけ覚えておけば良い。略した部分は後から重要になるはずなのでそのときに戻ってくる。同じく yjit_iface.c には以下のような定義があり、これで全体に分岐していそう。

bool
rb_yjit_enabled_p(void)
{
    return rb_yjit_opts.yjit_enabled;
}

なお余談だが、jit ないし yjit をサポートしない環境では yjit.c で以下のようにしてスタブ実装で置き換えられ、無視される。

// Check if we need to include YJIT in the build
#if JIT_ENABLED && YJIT_SUPPORTED_P
（略）
#else
（略）
void rb_yjit_init(struct rb_yjit_options *options) {}
（略）
#endif // if JIT_ENABLED && YJIT_SUPPORTED_P

rb_yjit_enabled_p() が呼ばれている箇所を確認する。まずは処理の根幹に関わらない version.c から。description や version 表記で利用されている。

version.c

void
Init_ruby_description(void)
{
    VALUE description;

    if (MJIT_OPTS_ON) {
        description = MKSTR(description_with_mjit);
    }
    else if (rb_yjit_enabled_p()) {
        description = MKSTR(description_with_yjit);
    }
    else {
        description = MKSTR(description);
    }
（略）
}

（略）

void
ruby_show_version(void)
{
    if (MJIT_OPTS_ON) {
        PRINT(description_with_mjit);
    }
    else if (rb_yjit_enabled_p()) {
        PRINT(description_with_yjit);
    }
    else {
        PRINT(description);
    }
（略）
}

mjit.h での使われ方が入口っぽい。

static inline VALUE
mjit_exec(rb_execution_context_t *ec)
{
    const rb_iseq_t *iseq = ec->cfp->iseq;
    struct rb_iseq_constant_body *body = iseq->body;
    bool yjit_enabled = false;
#ifndef MJIT_HEADER
    // Don't want to compile with YJIT or use code generated by YJIT
    // when running inside code generated by MJIT.
    yjit_enabled = rb_yjit_enabled_p();
#endif

とのこと。混ざってしまったら困るのは確かに。mjit_exec() 続き。

    if (mjit_call_p || yjit_enabled) {
        body->total_calls++;
    }

#ifndef MJIT_HEADER
    if (yjit_enabled && !mjit_call_p && body->total_calls == rb_yjit_call_threshold())  {
        // If we couldn't generate any code for this iseq, then return
        // Qundef so the interpreter will handle the call.
        if (!rb_yjit_compile_iseq(iseq, ec)) {
            return Qundef;
        }
    }
#endif

呼び出し回数がしきい値に達したら（total_calls がしきい値と等しくなったら）そのコードブロックをコンパイルする。このあたりは MJIT で導入された仕組みをほぼそのまま使っていると言ってよさそう。しきい値はちょっと戻って、コマンドライン引数のあたりで決定されている。デフォルトは 10 回。

yjit_iface.h

# define YJIT_DEFAULT_CALL_THRESHOLD 10

yjit_iface.c

unsigned
rb_yjit_call_threshold(void)
{
    return rb_yjit_opts.call_threshold;
}

（略）
void
rb_yjit_init(struct rb_yjit_options *options)
{
（略）
    if (rb_yjit_opts.call_threshold < 1) {
        rb_yjit_opts.call_threshold = YJIT_DEFAULT_CALL_THRESHOLD;
    }
（略）
}

ruby.c

static bool
setup_yjit_options(const char *s, struct rb_yjit_options *yjit_opt)
{
    const char prefix[] = "yjit-";
    if (strncmp(prefix, s, sizeof(prefix)-1) != 0) {
        return false;
    }
    s += sizeof(prefix)-1;
（略）

    if (yjit_opt_match_arg(s, l, "exec-mem-size")) {
（略）
    }
    else if (yjit_opt_match_arg(s, l, "call-threshold")) {
        yjit_opt->call_threshold = atoi(s + 1);
    }
（略）
    return true;
}

再度 mjit_exec() 続き。iseq の body->jit_func にある関数ポインタを取り出して呼び出している。

    if (!(mjit_call_p || yjit_enabled))
        return Qundef;

    RB_DEBUG_COUNTER_INC(mjit_exec);

    mjit_func_t func = body->jit_func;

    // YJIT tried compiling this function once before and couldn't do
    // it, so return Qundef so the interpreter handles it.
    if (yjit_enabled && func == 0) {
        return Qundef;
    }
（略）
    return func(ec, ec->cfp);
}

このあたりもやはり MJIT の仕組みに乗っている。

次は rb_yjit_compile_iseq() を見る。ここだけ見るとシンプル。

bool
rb_yjit_compile_iseq(const rb_iseq_t *iseq, rb_execution_context_t *ec)
{
#if (OPT_DIRECT_THREADED_CODE || OPT_CALL_THREADED_CODE) && JIT_ENABLED
    bool success = true;
    RB_VM_LOCK_ENTER();
    rb_vm_barrier();

    // Compile a block version starting at the first instruction
    uint8_t *code_ptr = gen_entry_point(iseq, 0, ec);

    if (code_ptr) {
        iseq->body->jit_func = (yjit_func_t)code_ptr;
    }
    else {
        iseq->body->jit_func = 0;
        success = false;
    }

    RB_VM_LOCK_LEAVE();
    return success;
#else
    return false;
#endif
}

gen_entry_point() が実際のコンパイルをしているように見える。 gen_entry_point() もやはり、ここだけ見るとシンプル。

// Generate a block version that is an entry point inserted into an iseq
static uint8_t *
gen_entry_point(const rb_iseq_t *iseq, uint32_t insn_idx, rb_execution_context_t *ec)
{
    // If we aren't at PC 0, don't generate code
    // See yjit_pc_guard
    if (iseq->body->iseq_encoded != ec->cfp->pc) {
        return NULL;
    }

    // The entry context makes no assumptions about types
    blockid_t blockid = { iseq, insn_idx };

    rb_vm_barrier();
    // Write the interpreter entry prologue. Might be NULL when out of memory.
    uint8_t *code_ptr = yjit_entry_prologue(cb, iseq);

    // Try to generate code for the entry block
    block_t *block = gen_block_version(blockid, &DEFAULT_CTX, ec);

    cb_mark_all_executable(ocb);
    cb_mark_all_executable(cb);

    // If we couldn't generate any code
    if (!block || block->end_idx == insn_idx) {
        return NULL;
    }

    return code_ptr;
}

iseq 先頭からのコンパイルのみ許容し、Ractor 対策で rb_vm_barrier() し、entry point を作成し、コード生成して実行可能属性をつける、というように読める。呼び出し元の rb_yjit_compile_iseq() からは insn_idx として 0 が渡されていたことを覚えておく。 cb_mark_all_executable() は mprotect(cb->mem_block_, cb->mem_size, PROT_READ | PROT_EXEC) する程度のものなので省略。

ここで、cb と ocb について。省略した rb_yjit_init() の初期化処理の yjit_init_codegen() で、cb には static codeblock_t block へのポインタ、ocb には static codeblock_t outline_block へのポインタがそれぞれセットされる。この構造体は以下のようになっている。

// Block of executable memory into which instructions can be written
typedef struct CodeBlock
{
    // Memory block
    // Users are advised to not use this directly.
    uint8_t *mem_block_;

    // Memory block size
    uint32_t mem_size;

    // Current writing position
    uint32_t write_pos;

    // Table of registered label addresses
    uint32_t label_addrs[MAX_LABELS];

    // Table of registered label names
    // Note that these should be constant strings only
    const char *label_names[MAX_LABELS];

    // References to labels
    labelref_t label_refs[MAX_LABEL_REFS];

    // Number of labels registeered
    uint32_t num_labels;

    // Number of references to labels
    uint32_t num_refs;


    // Keep track of the current aligned write position.
    // Used for changing protection when writing to the JIT buffer
    uint32_t current_aligned_write_pos;

    // Set if the assembler is unable to output some instructions,
    // for example, when there is not enough space or when a jump
    // target is too far away.
    bool dropped_bytes;

    // Flag to enable or disable comments
    bool has_asm;


} codeblock_t;

コメントがついていてわかりやすい。だからわかるというわけでもないが。

yjit_entry_prologue() に戻る。ASSERT や残り容量確認などを省くと以下のようになる。

/*
Compile an interpreter entry block to be inserted into an iseq
Returns `NULL` if compilation fails.
*/
static uint8_t *
yjit_entry_prologue(codeblock_t *cb, const rb_iseq_t *iseq)
{
（略）

    // Align the current write position to cache line boundaries
    cb_align_pos(cb, 64);

    uint8_t *code_ptr = cb_get_ptr(cb, cb->write_pos);
    ADD_COMMENT(cb, "yjit entry");

    push(cb, REG_CFP);
    push(cb, REG_EC);
    push(cb, REG_SP);

cb_align_pos() は 64 バイト境界に合わせてパディングする。uint8_t *cb_get_ptr(const codeblock_t *cb, uint32_t index) は十分な空きがあれば &cb->mem_block_[index] を返す。

push(cb, REG_CFP) から、埋め込みアセンブラのようになってくる。xbyak に慣れていたらわかりやすい。

#define REG_CFP R13 と static const x86opnd_t R13 = { OPND_REG, 64, .as.reg = { REG_GP, 13 }}; という定義から、yjit では ruby の CFP を amd64 の R13 に割り当てているのだろう、ということがわかる。

push() 全体を見る。

/// push - Push an operand on the stack
void push(codeblock_t *cb, x86opnd_t opnd)
{
    assert (opnd.num_bits == 64);

    //cb.writeASM("push", opnd);

    if (opnd.type == OPND_REG) {
      if (rex_needed(opnd))
          cb_write_rex(cb, false, 0, 0, opnd.as.reg.reg_no);
      cb_write_opcode(cb, 0x50, opnd);
    }
    else if (opnd.type == OPND_MEM) {
      cb_write_rm(cb, false, false, NO_OPND, opnd, 6, 1, 0xFF);
    }
    else {
      assert(false && "unexpected operand type");
    }
}

push 対象がレジスタであれば、PUSH RAX は 0x50、PUSH RCX は 0x51 のように、0x50 にレジスタ番号を足した機械語に変換する。このとき、64 ビットの拡張汎用レジスタ（R8〜R15）が対象であれば、REX プレフィックスをつける。対象がメモリであれば、ModR/M バイトを利用して変換。この部分、よくわかっていないし大変そうなので省略。必要があればまた。 cb_write_opcode() は cb_write_byte() の薄いラッパー。cb_write_byte() は以下の通り。

// Write a byte at the current position
void cb_write_byte(codeblock_t *cb, uint8_t byte)
{
    assert (cb->mem_block_);
    if (cb->write_pos < cb->mem_size) {
        cb_mark_position_writeable(cb, cb->write_pos);
        cb->mem_block_[cb->write_pos] = byte;
        cb->write_pos++;
    }
    else {
        cb->dropped_bytes = true;
    }
}

チェックなどをなくすと、やっていることは cb->mem_block_[cb->write_pos++] = byte に帰結する。

本来であれば gen_entry_point() の gen_block_version() に戻るところだが、このあたりでそろそろ lazy BBV の lazy な処理が見てみたい。 yjit_reg_op(BIN(branchif), gen_branchif) あたりから見ていく。

まず yjit_reg_op(int opcode, codegen_fn gen_fn) は、やっていることは gen_fns[opcode] = gen_fn; つまり RubyVM のオペコードに対応する関数ポインタを紐付けて記録するだけ。 gen_fns に登録された関数は gen_single_block() の中で呼び出される。

gen_branchif() を見る。要約するとこうなっている。

static codegen_status_t
gen_branchif(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
{
（略）

    // Test if any bit (outside of the Qnil bit) is on
    // RUBY_Qfalse  /* ...0000 0000 */
    // RUBY_Qnil    /* ...0000 1000 */
    x86opnd_t val_opnd = ctx_stack_pop(ctx, 1);
    test(cb, val_opnd, imm_opnd(~Qnil));

（略）

    // Generate the branch instructions
    gen_branch(
        jit,
        ctx,
        jump_block,
        ctx,
        next_block,
        ctx,
        gen_branchif_branch
    );

    return YJIT_END_BLOCK;
}

スタックから 1 オブジェクト取り出して偽かどうかを TEST で確認する。そのあと、gen_branch() で生成したコードに続く。現在の jit の状況、分岐前・分岐後それぞれのコンテキスト、分岐後のブロック、などを gen_branch() に渡す。このとき、gen_branchif_branch() なる関数のポインタも渡している。 gen_branch() はそんなに大きくなく、また重要なので全体を引用する。

static void
gen_branch(
    jitstate_t *jit,
    const ctx_t *src_ctx,
    blockid_t target0,
    const ctx_t *ctx0,
    blockid_t target1,
    const ctx_t *ctx1,
    branchgen_fn gen_fn
)
{
    RUBY_ASSERT(target0.iseq != NULL);

    branch_t *branch = make_branch_entry(jit->block, src_ctx, gen_fn);
    branch->targets[0] = target0;
    branch->targets[1] = target1;
    branch->target_ctxs[0] = *ctx0;
    branch->target_ctxs[1] = ctx1? *ctx1:DEFAULT_CTX;

    // Get the branch targets or stubs
    branch->dst_addrs[0] = get_branch_target(target0, ctx0, branch, 0);
    branch->dst_addrs[1] = ctx1? get_branch_target(target1, ctx1, branch, 1):NULL;

    // Call the branch generation function
    branch->start_addr = cb_get_write_ptr(cb);
    regenerate_branch(cb, branch);
}

make_branch_entry() も重要ではあるけれど、ここではざっくり「branch_t のコンストラクタ」程度の認識に留める。分岐を表す branch_t を初期化した上で、get_branch_target() で分岐先を生成。

// Get a version or stub corresponding to a branch target
static uint8_t *
get_branch_target(
    blockid_t target,
    const ctx_t *ctx,
    branch_t *branch,
    uint32_t target_idx
)
{
    //fprintf(stderr, "get_branch_target, block (%p, %d)\n", target.iseq, target.idx);

    block_t *p_block = find_block_version(target, ctx);

（略）

    // Generate an outlined stub that will call branch_stub_hit()
    uint8_t *stub_addr = cb_get_ptr(ocb, ocb->write_pos);

    // Call branch_stub_hit(branch_idx, target_idx, ec)
    mov(ocb, C_ARG_REGS[2], REG_EC);
    mov(ocb, C_ARG_REGS[1], imm_opnd(target_idx));
    mov(ocb, C_ARG_REGS[0], const_ptr_opnd(branch));
    call_ptr(ocb, REG0, (void *)&branch_stub_hit);

    // Jump to the address returned by the
    // branch_stub_hit call
    jmp_rm(ocb, RAX);

（略）

    return stub_addr;
}

なお #define C_ARG_REGS ( (x86opnd_t[]){ RDI, RSI, RDX, RCX, R8, R9 } ) というようになっていて、これは System V AMD64 ABI に準拠している想定で C の関数呼び出しに使っている。（ところで、今まで使っていた cb ではなく ocb に書き込んでいるのは興味深い）要するに生成したコードから関数 branch_stub_hit() を呼び出すようにしている。次は branch_stub_hit() を見る。

// Called by the generated code when a branch stub is executed
// Triggers compilation of branches and code patching
static uint8_t *
branch_stub_hit(branch_t *branch, const uint32_t target_idx, rb_execution_context_t *ec)
{
    uint8_t *dst_addr = NULL;

（略）
    blockid_t target = branch->targets[target_idx];
    const ctx_t *target_ctx = &branch->target_ctxs[target_idx];

    // If this branch has already been patched, return the dst address
    // Note: ractors can cause the same stub to be hit multiple times
    if (branch->blocks[target_idx]) {
        dst_addr = branch->dst_addrs[target_idx];
    }
    else {
（後述）
    }

（略）

    // Return a pointer to the compiled block version
    return dst_addr;
}

コメントが残されていてわかりやすい。だいたい書いてあるとおりで、すでに分岐先ブロックが生成済みならそのアドレスをそのまま使い、まだ生成してないなら生成する。ここで引数 ec が渡されているということは、Ruby コード実行中に呼び出されること、実行時コンテキストを使う前提になっていることがわかる。JIT だから当然とも言えるが、今までと比べてもより実行時の状態を気にしている処理となっている。続き。

        // :stub-sp-flush:
        // Generated code do stack operations without modifying cfp->sp, while the
        // cfp->sp tells the GC what values on the stack to root. Generated code
        // generally takes care of updating cfp->sp when it calls runtime routines that
        // could trigger GC, but it's inconvenient to do it before calling this function.
        // So we do it here instead.
        VALUE *const original_interp_sp = ec->cfp->sp;
        ec->cfp->sp += target_ctx->sp_offset;

        // Update the PC in the current CFP, because it
        // may be out of sync in JITted code
        ec->cfp->pc = yjit_iseq_pc_at_idx(target.iseq, target.idx);

生成コードでは SP をいちいち操作しないがそれだと GC 対象が変わるので、一気に offset だけ確保しておいて後に戻す。PC も是正する。続き。

        // Try to find an existing compiled version of this block
        block_t *p_block = find_block_version(target, target_ctx);

        // If this block hasn't yet been compiled
        if (!p_block) {
（略）
            // Compile the new block version
            p_block = gen_block_version(target, target_ctx, ec);
（略）
        }

ブロックがまだ作成されていなければここで作成する。なおここでいうブロックは Ruby のブロックではなく BBV での生成コードのひとかたまりを指すブロック。今後も特に説明なく「ブロック」と書いたらこれのことを指す。 if (cb_get_write_ptr(cb) == branch->end_addr) { ... } のあたりは、効率的なコード生成のためのテクニックっぽいのでとりあえず省略。

        if (p_block) {
（略）
            // Update the branch target address
            dst_addr = p_block->start_addr;
            branch->dst_addrs[target_idx] = dst_addr;

            // Mark this branch target as patched (no longer a stub)
            branch->blocks[target_idx] = p_block;

            // Rewrite the branch with the new jump target address
            regenerate_branch(cb, branch);

            // Restore interpreter sp, since the code hitting the stub expects the original.
            ec->cfp->sp = original_interp_sp;
        }
        else {
            // Failed to service the stub by generating a new block so now we
            // need to exit to the interpreter at the stubbed location. We are
            // intentionally *not* restoring original_interp_sp. At the time of
            // writing, reconstructing interpreter state only involves setting
            // cfp->sp and cfp->pc. We set both before trying to generate the
            // block. All there is left to do to exit is to pop the native
            // frame. We do that in code_for_exit_from_stub.
            dst_addr = code_for_exit_from_stub;
        }

        cb_mark_all_executable(ocb);
        cb_mark_all_executable(cb);
}

だいたい書いてあるとおりで、ブロックの作成に成功した場合、あるいはすでに存在していた場合は、branch->dst_addrs と branch->blocks にセットして、ブランチを上書きして、退避していた SP をもとに戻す。ブロックが作れなかった場合は code_for_exit_from_stub という、復帰用のコードを代わりに使う。なお、code_for_exit_from_stub は初期化処理で以下のように作成される。

yjit_codegen.c

// Fill code_for_exit_from_stub. This is used by branch_stub_hit() to exit
// to the interpreter when it cannot service a stub by generating new code.
// Before coming here, branch_stub_hit() takes care of fully reconstructing
// interpreter state.
static void
gen_code_for_exit_from_stub(void)
{
    codeblock_t *cb = ocb;
    code_for_exit_from_stub = cb_get_ptr(cb, cb->write_pos);

    GEN_COUNTER_INC(cb, exit_from_branch_stub);

    pop(cb, REG_SP);
    pop(cb, REG_EC);
    pop(cb, REG_CFP);

    mov(cb, RAX, imm_opnd(Qundef));
    ret(cb);
}

まとめると、「分岐実行で今回入らない分岐については、『コード生成する関数を呼び出す』というコードを stub として生成しておき、その後で分岐に入ってきたら先程の『コード生成する関数を呼び出す』という処理がなされる」というように読める。

TODO

ブロックバージョンの更新や選択
ocb を使う箇所とその理由
make_branch_entry()
gen_block_version()
rb_yjit_init()
- 適宜確認
- いきなり出てきた覚えのないグローバル変数などはおそらくここでセットしている
ちゃんとした文章化
- 調べながら文章を考えると効率が悪そうなので後回し
- たぶんよほどやる気にならないとメモのまま終わる

wanabe/gist:3ef8747868ac126c30f94888425eb56d