Last active
August 7, 2023 19:54
-
-
Save kddnewton/09830f563682d3cd8672c64ce5b7016e to your computer and use it in GitHub Desktop.
Faster JSON parser in Ruby
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# frozen_string_literal: true | |
require "bundler/inline" | |
gemfile do | |
source "https://rubygems.org" | |
gem "json_pure" | |
gem "benchmark-ips" | |
end | |
require "json/pure" | |
require "benchmark/ips" | |
# Recursive-descent JSON parser built directly on StringScanner.
#
# The source is scanned as raw bytes (ASCII-8BIT); decoded strings are
# re-tagged as UTF-8 before they are returned. Whitespace and //, /* */
# comments are accepted between tokens.
#
# Supported options (all optional):
#   :array_class      - class instantiated for JSON arrays (default Array)
#   :object_class     - class instantiated for JSON objects (default Hash)
#   :decimal_class    - class used for numbers with a fraction/exponent
#   :allow_nan        - accept the NaN, Infinity and -Infinity literals
#   :freeze           - return frozen, deduplicated strings
#   :symbolize_names  - convert object keys to symbols
#   :max_nesting      - maximum array/object depth; 0 or nil disables the
#                       check (default 100)
#   :create_additions - revive objects through json_create (default false)
#   :match_string     - pattern => class pairs used to revive strings when
#                       :create_additions is enabled
class FastJSON < StringScanner
  # Insignificant input between tokens: whitespace and comments.
  IGNORE = %r(
    (?:
      //[^\n\r]*[\n\r]| # line comments
      /\* # c-style comments
      (?:
        [^*/]| # normal chars
        /[^*]| # slashes that do not start a nested comment
        \*[^/]| # asterisks that do not end this comment
        /(?=\*/) # single slash before this comment's end
      )*
      \*/ # the End of this comment
      |[ \t\r\n]+ # whitespaces: space, horizontal tab, lf, cr
    )*
  )mx

  # A complete string literal; group 1 is the still-escaped content.
  STRING = /"((?:[^\x0-\x1f"\\]|\\["\\\/bfnrt]|\\u[0-9a-fA-F]{4}|\\[\x20-\x21\x23-\x2e\x30-\x5b\x5d-\x61\x63-\x65\x67-\x6d\x6f-\x71\x73\x75-\xff])*)"/n

  # One escape sequence. Consecutive \uXXXX escapes are matched as a single
  # run so that surrogate pairs are transcoded together.
  ESCAPE = %r{(?:\\[\\bfnrt"/]|(?:\\u(?:[A-Fa-f\d]{4}))+|\\[\x20-\xff])}n

  # A number. Group 1 is set only when a fraction and/or exponent is present,
  # which is what distinguishes a Float from an Integer. (A group that is
  # allowed to match the empty string would always be truthy.)
  NUMBER = /-?(?:0|[1-9]\d*)(\.\d+(?:[Ee][-+]?\d+)?|[Ee][-+]?\d+)?/

  # @param source [#to_str] the JSON document
  # @param opts [Hash, nil] parser options, see the class documentation
  # @raise [TypeError] if +source+ does not respond to to_str
  # @raise [ArgumentError] if :symbolize_names and :create_additions are
  #   both enabled
  def initialize(source, opts = {})
    unless source.respond_to?(:to_str)
      raise TypeError, "#{source.inspect} is not like a string"
    end

    source = source.to_str
    if source.encoding != ::Encoding::ASCII_8BIT
      # encode returns a copy, so the caller's string is never re-tagged.
      source = source.encode(::Encoding::UTF_8)
      source.force_encoding(::Encoding::ASCII_8BIT)
    end
    super(source)

    opts ||= {}
    @array_class = opts[:array_class] || Array
    @decimal_class = opts[:decimal_class]
    @object_class = opts[:object_class] || Hash
    @allow_nan = !!opts[:allow_nan]
    @freeze = !!opts[:freeze]
    @symbolize_names = !!opts[:symbolize_names]
    @current_nesting = 0
    @max_nesting = opts.key?(:max_nesting) ? (opts[:max_nesting] || 0) : 100
    @create_additions = opts.key?(:create_additions) ? !!opts[:create_additions] : false
    @create_id = @create_additions ? JSON.create_id : nil
    @match_string = opts[:match_string]

    if @symbolize_names && @create_additions
      raise ArgumentError, "options :symbolize_names and :create_additions cannot be used in conjunction"
    end
  end

  # Parses the whole document.
  #
  # @return [Object] the decoded value
  # @raise [JSON::ParserError] on malformed input or trailing content
  # @raise [JSON::NestingError] if the document nests deeper than :max_nesting
  def parse
    skip(IGNORE)
    value = parse_item
    skip(IGNORE)
    raise JSON::ParserError, "unexpected tokens after value" unless eos?
    value
  end

  private

  # Parses exactly one value starting at the current scan position. Leading
  # and trailing IGNORE input is consumed by the callers.
  def parse_item
    case
    when scan(STRING)
      string = self[1]
      if string.empty?
        # Nothing to unescape, but keep encoding and :freeze consistent with
        # non-empty strings.
        string.force_encoding(::Encoding::UTF_8)
        return @freeze ? -string : string
      end

      string.gsub!(ESCAPE) do |c|
        case c[1]
        when "b" then "\b"
        when "f" then "\f"
        when "n" then "\n"
        when "r" then "\r"
        when "t" then "\t"
        when "u" # one or more \uXXXX escapes
          # Collect the escapes as UTF-16BE bytes, then transcode the run.
          bytes = String.new(encoding: ::Encoding::ASCII_8BIT)
          i = 0
          while c[i] == "\\" && c[i + 1] == "u"
            bytes << c[i + 2, 2].to_i(16) << c[i + 4, 2].to_i(16)
            i += 6
          end
          # Re-tag as binary so it can be spliced into the binary buffer.
          bytes.encode("utf-8", "utf-16be").force_encoding(::Encoding::ASCII_8BIT)
        else
          # \" \\ \/ and any other escaped character stand for themselves.
          c[1]
        end
      end
      string.force_encoding(::Encoding::UTF_8)
      string = -string if @freeze

      if @create_additions && @match_string
        @match_string.each do |pattern, clazz|
          return clazz.json_create(string) if clazz.json_creatable? && string =~ pattern
        end
      end
      string
    when skip("{")
      @current_nesting += 1
      if @max_nesting != 0 && @current_nesting > @max_nesting
        raise JSON::NestingError, "nesting of #{@current_nesting} is too deep"
      end

      values = @object_class.new
      skip(IGNORE)
      if skip("}")
        @current_nesting -= 1
        return values
      end

      while true
        key = parse_item
        raise JSON::ParserError, "expected a string key" unless key.is_a?(String)
        skip(IGNORE)
        key = key.to_sym if @symbolize_names
        raise JSON::ParserError, "expected a ':' to follow the string key" unless skip(":")
        skip(IGNORE)
        values[key] = parse_item
        skip(IGNORE)

        case
        when skip("}")
          @current_nesting -= 1
          if @create_additions && (clazz_name = values[@create_id])
            clazz = JSON.deep_const_get(clazz_name)
            values = clazz.json_create(values) if clazz && clazz.json_creatable?
          end
          return values
        when skip(",")
          skip(IGNORE)
        else
          raise JSON::ParserError, "expected ',' or '}' after object value"
        end
      end
    when skip("[")
      @current_nesting += 1
      if @max_nesting != 0 && @current_nesting > @max_nesting
        raise JSON::NestingError, "nesting of #{@current_nesting} is too deep"
      end

      values = @array_class.new
      skip(IGNORE)
      if skip("]")
        @current_nesting -= 1
        return values
      end

      while true
        values << parse_item
        skip(IGNORE)

        case
        when skip("]")
          @current_nesting -= 1
          return values
        when skip(",")
          skip(IGNORE)
        else
          raise JSON::ParserError, "expected ',' or ']' after array value"
        end
      end
    when skip("true")
      true
    when skip("false")
      false
    when skip("null")
      nil
    when scan(NUMBER)
      if !self[1]
        Integer(self[0])
      elsif @decimal_class
        # Guarded with defined? so a custom decimal class works even when
        # bigdecimal has not been loaded.
        if defined?(::BigDecimal) && @decimal_class == ::BigDecimal
          BigDecimal(self[0])
        else
          @decimal_class.new(self[0]) || Float(self[0])
        end
      else
        Float(self[0])
      end
    when @allow_nan && skip("NaN")
      Float::NAN
    when @allow_nan && skip("Infinity")
      Float::INFINITY
    when @allow_nan && skip("-Infinity")
      -Float::INFINITY
    else
      raise JSON::ParserError, "unexpected token at #{pos}: '#{peek(1)}'"
    end
  end
end
# Stripped-down recursive-descent JSON parser built on StringScanner.
#
# Always builds plain Array/Hash/String/Integer/Float values and does not
# accept comments. Whitespace is skipped with \s, which also matches form
# feed and vertical tab in addition to the four JSON whitespace characters.
#
# Supported options (all optional):
#   :symbolize_names - convert object keys to symbols
#   :max_nesting     - maximum array/object depth; 0 or nil disables the
#                      check (default 100)
class FasterJSON < StringScanner
  # A complete string literal; group 1 is the still-escaped content.
  STRING = /"((?:[^\x0-\x1f"\\]|\\["\\\/bfnrt]|\\u[0-9a-fA-F]{4}|\\[\x20-\x21\x23-\x2e\x30-\x5b\x5d-\x61\x63-\x65\x67-\x6d\x6f-\x71\x73\x75-\xff])*)"/n

  # One escape sequence. Consecutive \uXXXX escapes are matched as a single
  # run so that surrogate pairs are transcoded together.
  ESCAPE = %r{(?:\\[\\bfnrt"/]|(?:\\u(?:[A-Fa-f\d]{4}))+|\\[\x20-\xff])}n

  # A number. Group 1 is set only when a fraction and/or exponent is present,
  # which is what distinguishes a Float from an Integer. (A group that is
  # allowed to match the empty string would always be truthy.)
  NUMBER = /-?(?:0|[1-9]\d*)(\.\d+(?:[Ee][-+]?\d+)?|[Ee][-+]?\d+)?/

  # @param source [#to_str] the JSON document
  # @param opts [Hash, nil] parser options, see the class documentation
  # @raise [TypeError] if +source+ does not respond to to_str
  def initialize(source, opts = {})
    unless source.respond_to?(:to_str)
      raise TypeError, "#{source.inspect} is not like a string"
    end

    source = source.to_str
    if source.encoding != ::Encoding::ASCII_8BIT
      # encode returns a copy, so the caller's string is never re-tagged.
      source = source.encode(::Encoding::UTF_8)
      source.force_encoding(::Encoding::ASCII_8BIT)
    end
    super(source)

    opts ||= {}
    @symbolize_names = !!opts[:symbolize_names]
    @current_nesting = 0
    @max_nesting = opts.key?(:max_nesting) ? (opts[:max_nesting] || 0) : 100
  end

  # Parses the whole document.
  #
  # @return [Object] the decoded value
  # @raise [JSON::ParserError] on malformed input or trailing content
  # @raise [JSON::NestingError] if the document nests deeper than :max_nesting
  def parse
    skip(/\s*/)
    value = parse_item
    skip(/\s*/)
    raise JSON::ParserError, "unexpected tokens after value" unless eos?
    value
  end

  private

  # Parses exactly one value starting at the current scan position. Leading
  # and trailing whitespace is consumed by the callers.
  def parse_item
    case
    when scan(STRING)
      string = self[1]
      # Nothing to unescape; still hand back a UTF-8 string like every other
      # string value.
      return string.force_encoding(::Encoding::UTF_8) if string.empty?

      string.gsub!(ESCAPE) do |c|
        case c[1]
        when "b" then "\b"
        when "f" then "\f"
        when "n" then "\n"
        when "r" then "\r"
        when "t" then "\t"
        when "u" # one or more \uXXXX escapes
          # Collect the escapes as UTF-16BE bytes, then transcode the run.
          bytes = String.new(encoding: ::Encoding::ASCII_8BIT)
          i = 0
          while c[i] == "\\" && c[i + 1] == "u"
            bytes << c[i + 2, 2].to_i(16) << c[i + 4, 2].to_i(16)
            i += 6
          end
          # Re-tag as binary so it can be spliced into the binary buffer.
          bytes.encode("utf-8", "utf-16be").force_encoding(::Encoding::ASCII_8BIT)
        else
          # \" \\ \/ and any other escaped character stand for themselves.
          c[1]
        end
      end
      string.force_encoding(::Encoding::UTF_8)
      string
    when skip("{")
      @current_nesting += 1
      if @max_nesting != 0 && @current_nesting > @max_nesting
        raise JSON::NestingError, "nesting of #{@current_nesting} is too deep"
      end

      values = {}
      skip(/\s*/)
      if skip("}")
        @current_nesting -= 1
        return values
      end

      while true
        key = parse_item
        # Validate before symbolizing: a symbolized key is no longer a String,
        # and a non-string key (e.g. null) does not respond to to_sym.
        raise JSON::ParserError, "expected a string key" unless key.is_a?(String)
        key = key.to_sym if @symbolize_names
        raise JSON::ParserError, "expected a ':' to follow the string key" unless skip(/\s*:\s*/)
        values[key] = parse_item
        skip(/\s*/)

        case
        when skip(",")
          skip(/\s*/)
        when skip("}")
          @current_nesting -= 1
          return values
        else
          raise JSON::ParserError, "expected ',' or '}' after object value"
        end
      end
    when skip("[")
      @current_nesting += 1
      if @max_nesting != 0 && @current_nesting > @max_nesting
        raise JSON::NestingError, "nesting of #{@current_nesting} is too deep"
      end

      values = []
      skip(/\s*/)
      if skip("]")
        @current_nesting -= 1
        return values
      end

      while true
        values << parse_item
        skip(/\s*/)

        case
        when skip(",")
          skip(/\s*/)
        when skip("]")
          @current_nesting -= 1
          return values
        else
          raise JSON::ParserError, "expected ',' or ']' after array value"
        end
      end
    when skip("true")
      true
    when skip("false")
      false
    when skip("null")
      nil
    when scan(NUMBER)
      self[1] ? Float(self[0]) : Integer(self[0])
    else
      raise JSON::ParserError, "unexpected token at #{pos}: '#{peek(1)}'"
    end
  end
end
# Sample document covering every JSON value kind the parsers handle.
sample = [true, false, nil, 3, "hi\u0000", [], %w[a b c], {}, { "a" => 1, "b" => 2, "c" => 3 }]
source = JSON.pretty_generate(sample)

# Sanity check: both parsers must agree with json_pure before being timed.
# NOTE: the comparison uses ==, under which Integer 3 and Float 3.0 are equal,
# so it does not detect an Integer/Float mix-up.
reference = JSON::Pure::Parser.new(source).parse
[FastJSON, FasterJSON].each do |parser|
  raise if reference != parser.new(source).parse
end

# Report label => parser class, in the order they are measured.
contenders = {
  "json_pure" => JSON::Pure::Parser,
  "fast_json" => FastJSON,
  "faster_json" => FasterJSON
}

Benchmark.ips do |x|
  contenders.each do |label, parser|
    x.report(label) { parser.new(source).parse }
  end
  x.compare!
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Warming up -------------------------------------- | |
json_pure 4.302k i/100ms | |
fast_json 8.027k i/100ms | |
faster_json 10.110k i/100ms | |
Calculating ------------------------------------- | |
json_pure 42.378k (± 2.1%) i/s - 215.100k in 5.078163s | |
fast_json 78.217k (± 4.1%) i/s - 393.323k in 5.038024s | |
faster_json 97.307k (± 3.5%) i/s - 495.390k in 5.097498s | |
Comparison: | |
faster_json: 97306.6 i/s | |
fast_json: 78216.6 i/s - 1.24x slower | |
json_pure: 42377.7 i/s - 2.30x slower |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Results for TruffleRuby and 3.2.0 YJIT on my machine (without cpufreq/boost and with performance governor):
So significantly faster than `json_pure` for both, great!