Last active
October 10, 2024 05:01
-
-
Save tenderlove/e9bb912648a3d2bce00c4f60bc632a10 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "strscan" | |
class Lexer | |
IDENTIFIER = /[_A-Za-z][_0-9A-Za-z]*\b/ | |
IGNORE = %r{ | |
(?: | |
[, \c\r\n\t]+ | | |
\#.*$ | |
)* | |
}x | |
INT = /[-]?(?:[0]|[1-9][0-9]*)/ | |
FLOAT_DECIMAL = /[.][0-9]+/ | |
FLOAT_EXP = /[eE][+-]?[0-9]+/ | |
FLOAT = /#{INT}#{FLOAT_DECIMAL}#{FLOAT_EXP}|#{FLOAT_DECIMAL}|#{FLOAT_EXP}/ | |
KEYWORDS = [ "on", "fragment", "true", "false", "null", "query", "mutation", | |
"subscription", "schema", "scalar", "type", "extend", "implements", | |
"interface", "union", "enum", "input", "directive", "repeatable" | |
].freeze | |
KW_RE = /#{Regexp.union(KEYWORDS.sort)}\b/ | |
module Literals | |
LCURLY = '{' | |
RCURLY = '}' | |
LPAREN = '(' | |
RPAREN = ')' | |
LBRACKET = '[' | |
RBRACKET = ']' | |
COLON = ':' | |
VAR_SIGN = '$' | |
DIR_SIGN = '@' | |
EQUALS = '=' | |
BANG = '!' | |
PIPE = '|' | |
AMP = '&' | |
end | |
ELLIPSIS = '...' | |
PUNCTUATION_TABLE = Literals.constants.each_with_object([]) { |n, o| | |
o[Literals.const_get(n).ord] = n | |
} | |
def initialize doc | |
@doc = doc | |
@scan = StringScanner.new doc | |
end | |
def next_token | |
@scan.skip(IGNORE) | |
return if @scan.eos? | |
case | |
when tok = PUNCTUATION_TABLE[@doc.getbyte(@scan.pos)] then | |
# If the byte at the current position is inside our lookup table, push | |
# the scanner position forward 1 and return the token | |
@scan.pos += 1 | |
tok | |
when len = @scan.skip(KW_RE) then | |
# Return early if uniquely identifiable via length | |
return :ON if len == 2 | |
return :SUBSCRIPTION if len == 12 | |
# Get the position of the start of the keyword | |
start = @scan.pos - len | |
# Get the 2nd and 3rd byte of the keyword and combine to a 16 bit int | |
key = (@doc.getbyte(start + 2) << 8) | @doc.getbyte(start + 1) | |
KW_TABLE[_hash(key)] | |
when @scan.skip(IDENTIFIER) then :IDENTIFIER | |
when @scan.skip(ELLIPSIS) then :ELLIPSIS | |
when @scan.skip(FLOAT) then :FLOAT | |
when @scan.skip(INT) then :INT | |
else | |
@scan.getch | |
:UNKNOWN_CHAR | |
end | |
end | |
KW_TABLE = [:INTERFACE, :MUTATION, :EXTEND, :FALSE, :ENUM, :TRUE, :NULL, | |
nil, nil, nil, nil, nil, nil, nil, :QUERY, nil, nil, :REPEATABLE, | |
:IMPLEMENTS, :INPUT, :TYPE, :SCHEMA, nil, nil, nil, :DIRECTIVE, | |
:UNION, nil, nil, :SCALAR, nil, :FRAGMENT] | |
def _hash key | |
(key * 18592990) >> 27 & 0x1f | |
end | |
end | |
require "benchmark/ips" | |
def allocations | |
x = GC.stat(:total_allocated_objects) | |
yield | |
GC.stat(:total_allocated_objects) - x | |
end | |
def go doc | |
lexer = Lexer.new doc | |
while tok = lexer.next_token | |
# do nothing | |
end | |
end | |
if $0 == __FILE__ | |
doc = ARGF.read | |
Benchmark.ips { |x| x.report { go doc } } | |
p ALLOCATIONS: allocations { go doc } | |
p ALLOCATIONS: allocations { go doc } | |
end |
@Palaolitico wow this is great! Thank you!! 😍
Do you mind sending a PR upstream? I'd be happy to merge.
Yes, of course. Here you have it.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @tenderlove.
Nice post!
Because all keywords are lexically identifiers, I did some experiments to check if processing keywords and identifiers together would provide some performance gains. And in fact, I found an approach that gives a 10% improvement in the upstream benchmark
The idea is to use masks instead of sequential codes for lead characters:
And consider keywords after capturing an identifier, doing some tricks to avoid matching with
KW_RE
as much as possible:KW_FP3
is a precomputed map with fingerprints of the bytes 1, 2 and 3 of all keywords (excepton
) to have a fast check to avoid checking forKW_RE
most of the times, and also to find out the keyword token.Enabling YJIT, performance goes from
to
Regards!