-
-
Save tenderworks/f4cbb60f2c0dc3ab334eb73fec36f702 to your computer and use it in GitHub Desktop.
# frozen_string_literal: true
# For now, run like this: `ruby --rjit --rjit-disable fjit.rb`
#
# Once RJIT is removed, the extra flags will not be necessary
require "fiddle" | |
require "ffi" | |
require "jit_buffer" | |
require "hacks" | |
require "aarch64" | |
require "strlen" | |
require "benchmark/ips" | |
# Experimental "FFI through JIT" helper for AArch64.
#
# A module that extends FJIT gains +attach_function+, which defines an empty
# Ruby singleton method and then installs hand-assembled machine code as that
# method's iseq +jit_entry+. The machine code converts the String argument to
# a C string, calls the native function of the same name, tags the result as
# a Ruby Integer and pops the control frame itself.
module FJIT
  # RJIT's bindings for the VM's C structs (offsets and sizes are read from it).
  C = RubyVM::RJIT.const_get(:C)

  include AArch64::Registers

  # Largest immediate that fits in a 64-bit general purpose register.
  MAX_IMMEDIATE = 0xFFFF_FFFF_FFFF_FFFF

  # Reads one pointer-sized word from raw memory.
  #
  # @param ptr [Integer] base address
  # @param offset [Integer] byte offset from +ptr+
  # @return [Integer] the word stored there, decoded as an unsigned
  #   pointer-width integer ("J") so addresses never come back negative
  def read_ptr(ptr, offset)
    Fiddle::Pointer.new(ptr)[offset, Fiddle::SIZEOF_VOIDP].unpack1("J")
  end

  # Emits the instructions that load the immediate +num+ into register +out+,
  # 16 bits at a time: one MOVZ for the low chunk followed by a MOVK for each
  # higher chunk up to the most significant non-zero one.
  #
  # MOVZ is always emitted, so loading 0 really does zero the register
  # (previously nothing was emitted for 0 and the register kept stale data).
  #
  # @param asm [AArch64::Assembler] assembler receiving the instructions
  # @param out [Object] destination register
  # @param num [Integer] value to load, 0..2**64-1
  # @raise [ArgumentError] if +num+ does not fit in an unsigned 64-bit register
  # @return [nil]
  def loadi(asm, out, num)
    unless (0..MAX_IMMEDIATE).cover?(num)
      raise ArgumentError, "immediate out of range for a 64-bit register: #{num.inspect}"
    end

    asm.movz out, num & 0xFFFF, lsl: 0

    shift = 16
    rest = num >> 16
    while rest > 0
      asm.movk out, rest & 0xFFFF, lsl: shift
      shift += 16
      rest >>= 16
    end
  end

  # Defines +self.<name>+ on the extending module and backs it with JIT code
  # that calls the native symbol +name+ looked up in Fiddle::Handle::DEFAULT.
  #
  # Only the number of entries in +params+ is used (it fixes the stub's
  # arity); the generated code always treats the argument it finds on the VM
  # stack as a String and passes its C string pointer to the native function.
  #
  # @param name [Symbol, String] name of both the Ruby method and the C symbol
  # @param params [Array] one entry per parameter of the Ruby stub
  # @param ret [Symbol] native return type; only +:int+ is supported
  # @raise [ArgumentError] if +ret+ is not a supported type
  # @return [Integer] address of the generated machine code
  def attach_function(name, params, ret)
    # Validate first so an unsupported type does not leave a no-op stub behind.
    raise ArgumentError, "unknown type #{ret}" unless ret == :int

    # The stub body is empty; it only exists to own an iseq we can patch.
    arg_list = params.map { "_" }.join(", ")
    class_eval "def self.#{name}(#{arg_list}); end"
    rb_iseq = RubyVM::InstructionSequence.of(method(name))

    # Get the pointer to the iseq obj: follow the wrapper object's
    # RTypedData "data" field, then dereference once more.
    addr = Fiddle.dlwrap(rb_iseq)
    offset = Hacks::STRUCTS["RTypedData"]["data"][0]
    addr = read_ptr(read_ptr(addr, offset), 0)
    iseq_t = C.rb_iseq_t.new addr

    asm = AArch64::Assembler.new

    # X0 has the ec, X1 has the CFP; save both on the stack
    asm.stp X0, X1, [SP, -16], :!
    # save X29 and X30 (the branch link reg), which the calls below clobber
    asm.stp X29, X30, [SP, -16], :!

    # Load the frame's VM stack pointer into X0 ...
    asm.ldr X0, [X1, C.rb_control_frame_t.offsetof(:sp)]
    # ... and step back 4 slots of 8 bytes so X0 addresses the argument.
    # NOTE(review): the 4-slot distance depends on the stub's frame layout -
    # confirm if the stub's arity or locals ever change.
    asm.sub(X0, X0, (4 * 8))

    # Get the underlying string pointer (result comes back in X0)
    loadi(asm, X2, Fiddle::Handle::DEFAULT["rb_string_value_cstr"])
    asm.blr X2

    # Call the function; X0 still holds the C string as its first argument
    loadi(asm, X2, Fiddle::Handle::DEFAULT[name.to_s])
    asm.blr X2

    asm.ldp X29, X30, [SP], 16

    # :int return - tag the native integer as a Ruby Integer: (n << 1) + 1
    asm.lsl(X0, X0, 1)
    asm.add(X0, X0, 1)

    # restore the saved X0 and X1, but in to X1 and X2 to avoid a mov
    # (X0 now carries the return value)
    asm.ldp X1, X2, [SP], 16
    # pop frame: ec->cfp = cfp + sizeof(rb_control_frame_t)
    asm.add(X2, X2, C.rb_control_frame_t.size)
    asm.stur(X2, [X1, C.rb_execution_context_t.offsetof(:cfp)])
    asm.ret

    # NOTE(review): nothing keeps +jit+ referenced after this method returns;
    # confirm JITBuffer does not release its memory when collected.
    jit = JITBuffer.new 1024
    jit.writeable!
    asm.write_to jit
    jit.executable!

    iseq_t.body.jit_entry = jit.to_i
  end
end
# Benchmark subject "strlen-ffi": the C library's strlen bound through the
# ffi gem's FFI::Library DSL.
module A
  extend FFI::Library

  # Look the symbol up in the library named 'c'.
  ffi_lib 'c'
  # One :string parameter, :int return value.
  attach_function :strlen, [:string], :int
end
# Benchmark subject "strlen-ruby": a plain Ruby method that forwards to
# String#bytesize, i.e. a Ruby-to-Ruby call with no native boundary.
module B
  # @param x [#bytesize] the string to measure
  # @return [Integer] whatever +x.bytesize+ reports
  def self.strlen(x)
    x.bytesize
  end
end
# Benchmark subject "strlen-fjit": strlen attached through FJIT, so calling
# C.strlen runs the machine code FJIT installs as the stub's jit_entry.
module C
  extend FJIT

  # Same signature as the FFI binding in module A, for a like-for-like run.
  attach_function :strlen, [:string], :int
end
# Compare the cost of calling strlen on the same short string through each
# binding style. Strlen comes from the "strlen" library required at the top
# of the file; `str.bytesize` is the direct call with no wrapper method.
str = "foo"

Benchmark.ips do |x|
  x.report("strlen-ffi") { A.strlen(str) }
  x.report("strlen-ruby") { B.strlen(str) }
  x.report("strlen-cext") { Strlen.strlen(str) }
  x.report("ruby-direct") { str.bytesize }
  x.report("strlen-fjit") { C.strlen(str) }
  x.compare!
end
Right, calculating the strlen of a short string (or simply reading a field) is not the bottleneck being demonstrated in this benchmark. This benchmark is meant to demonstrate frame push and pop overhead (including Ruby value to native conversion):
- Ruby to Ruby (`B.strlen`)
- Ruby to native via C extension (`Strlen.strlen`)
- Ruby to native via FFI (`A.strlen`)
- Ruby to native via JIT code (`C.strlen`)
`str.bytesize` is relevant in that it gives a baseline value for comparison (one that probably can't be beaten, as it only pushes one frame). Each type of frame push / pop has different overhead.
Hi, I thought you might be interested in looking at my project; I did a similar benchmark. My solution isn't as fast as yours, but it's still ~2x faster than regular FFI, and it's compatible with older Rubies and potentially with all of FFI's types and data converters. (UPD: I added the forgotten inline attributes and now it's a bit faster than cext.)
BTW, `str.bytesize` isn't really relevant here, as it doesn't perform `strlen`; it returns a precomputed value:
I mean, the `strlen-ruby` and `strlen-cext` comparison results depend on the length of `str`, as your gem does perform `strlen`.