Skip to content

Instantly share code, notes, and snippets.

@thecere
Last active March 24, 2018 18:52
Show Gist options
  • Save thecere/a52e7a9ab9a3e3878c08 to your computer and use it in GitHub Desktop.
Save thecere/a52e7a9ab9a3e3878c08 to your computer and use it in GitHub Desktop.
scite js/jsx lua lexer
-- Copyright 2006-2015 Mitchell mitchell.att.foicica.com. See LICENSE.
-- JavaScript LPeg lexer.
local l = require('lexer')
local token, word_match = l.token, l.word_match
local P, R, S = lpeg.P, lpeg.R, lpeg.S
-- Module table returned at the end of this file.
local M = { _NAME = 'javascript' }

-- Whitespace: one token per run of `l.space` characters.
local ws = token(l.WHITESPACE, l.space^1)

-- Comments.
-- Line comment: '//' followed by any number of `l.nonnewline_esc` matches.
local line_comment = P('//') * l.nonnewline_esc^0
-- Block comment: the closing '*/' is optional, so an unterminated comment
-- still matches up to the end of the input.
local block_comment_end = P('*/')
local block_comment = P('/*') * (l.any - block_comment_end)^0 * block_comment_end^-1
local comment = token(l.COMMENT, line_comment + block_comment)
-- Strings.
-- Single-quoted, double-quoted and back-quoted (template) literals.
local sq_str = l.delimited_range("'")
local dq_str = l.delimited_range('"')
local xq_str = l.delimited_range('`')
-- Regex literal: a '/'-delimited range followed by optional i/g/m flags. It is
-- only tried when the next character is '/' and `l.last_char_includes`
-- accepts the preceding context (the operator characters listed below).
local regex_str = #P('/')
  * l.last_char_includes('+-*%^!=&|?:;,([{<>')
  * l.delimited_range('/', true)
  * S('igm')^0
-- The regex token is shared by both string alternatives below.
local regex = token(l.REGEX, regex_str)
local simplestring = token(l.STRING, sq_str + dq_str) + regex
local templstring = token('string_template', xq_str) + regex
-- NOTE(review): this local shadows Lua's standard `string` library table for
-- the rest of the file. Method calls on string values (s:find(...)) are not
-- affected, but `string.format` and friends are unavailable below this line.
local string = simplestring + templstring
-- Numbers.
-- Optional leading '-', one or more digits, an optional fractional part
-- ('.' followed by zero or more digits) and an optional exponent ('e'/'E'
-- followed by `l.integer`).
-- The exponent is deliberately independent of the fractional part: it used to
-- be nested inside the fraction group, so literals without a '.' such as
-- `1e5` were split into the number `1` and the identifier `e5`.
local float_int = S('-')^-1 * R('09')^1 * (P('.') * R('09')^0)^-1 * (S('Ee') * l.integer)^-1
local number = token(l.NUMBER, float_int)
-- Keywords.
-- Every word in this list is emitted as a KEYWORD token. Besides reserved
-- words it also contains a few names the author wants highlighted the same
-- way (e.g. 'console', 'global', 'arguments').
local keyword_list = {
  'abstract', 'boolean', 'break', 'byte', 'case', 'catch', 'char', 'class',
  'const', 'continue', 'debugger', 'default', 'delete', 'do', 'double', 'else',
  'enum', 'export', 'extends', 'false', 'final', 'finally', 'float', 'for',
  'goto', 'if', 'implements', 'import', 'in', 'instanceof', 'int',
  'interface', 'let', 'long', 'native', 'new', 'null', 'undefined', 'package',
  'private', 'protected', 'public', 'return', 'short', 'static', 'super',
  'switch', 'synchronized', 'this', 'throw', 'throws', 'transient', 'true',
  'try', 'typeof', 'var', 'void', 'volatile', 'while', 'with', 'yield',
  'function', 'from', 'of', 'global', 'console', 'constructor', 'async',
  'await', 'as', 'arguments', 'eval',
}
local keyword = token(l.KEYWORD, word_match(keyword_list))
-- Identifiers.
-- The same word pattern (`l.word`) is emitted under three different token
-- names; which one is produced depends on the grammar rule that consumes it.
local identifier = token(l.IDENTIFIER, l.word)
local variable = token(l.VARIABLE, l.word)
-- A function name is a word at a position where `keyword` does not match
-- (negative lookahead on `keyword`, then the word itself).
local funcname = -keyword * token(l.FUNCTION, l.word)
-- Operators.
-- Generic single-character operator token.
local operator = token(l.OPERATOR, S('+-/*%^!=&|?:;,.()[]{}<>'))
-- Spread / rest marker.
local splat = token(l.OPERATOR, P('...'))
-- NOTE(review): the next two names are misleading. `colon` matches a comma
-- (',') and `double` matches a colon (':'). They are kept as-is because other
-- patterns in this file refer to them.
local colon = token(l.OPERATOR, P(','))
local double = token(l.OPERATOR, P(':'))
-- Individual bracket tokens used by the destructuring and JSX patterns.
local opencurly = token(l.OPERATOR, P('{'))
local closecurly = token(l.OPERATOR, P('}'))
local openbracket = token(l.OPERATOR, P('['))
local closebracket = token(l.OPERATOR, P(']'))
-- Array unpacking inside a parameter list: let f([a, b, ...rest], c, d) => null
-- XXX: no recursive unpacking; entries without a separating ',' are not
-- understood.
-- One unpacked name: optional whitespace, a variable, optional whitespace and
-- any number of `colon` tokens (note that `colon` is the ',' token).
local unpack_array = ws^0 * variable * ws^0 * colon^0
-- Trailing rest element: optional whitespace, '...' and a variable.
local unpack_array_rest = ws^0 * splat * variable
-- "[a, b, ...rest]": bracketed list of names with an optional rest element.
local unpack_array_parenth = openbracket * unpack_array^0 * unpack_array_rest^-1 * ws^0 * closebracket
-- "{a, b, ...rest}": same as above but delimited by curly braces.
local unpack_object_parenth = opencurly * unpack_array^0 * unpack_array_rest^-1 * ws^0 * closecurly
-- local sig_default_param = token(l.OPERATOR, S('=')) * ws^0 * (unpack_object_parenth + unpack_array_parenth)
-- A simple value usable as a default: number, identifier or string/regex.
local scalar = number + identifier + string
-- "key: value" pair built from two scalars (`double` is the ':' token).
local dictentry = scalar * ws^0 * double * ws^0 * scalar
-- Default parameter values. Three shapes are recognized:
--   0: "= scalar" (one or more consecutive scalars)
local sig_default_param0 = token(l.OPERATOR, S('=')) * ws^0 * scalar^1
--   1: "= { key: value, ... }" with an optional trailing ','
local sig_default_param1 = token(l.OPERATOR, S('=')) * ws^0 * opencurly * ws^0 * (dictentry^1 * ws^0 * (colon * ws^0 * dictentry)^0 * colon^-1)^0 * ws^0 * closecurly
--   2: "= [ value, ... ]" with an optional trailing ','
local sig_default_param2 = token(l.OPERATOR, S('=')) * ws^0 * openbracket * ws^0 * (scalar^1 * ws^0 * (colon * ws^0 * scalar)^0 * colon^-1)^0 * ws^0 * closebracket
-- Ordered choice: the scalar form is tried first, then the object form, then
-- the array form.
local sig_default_param = sig_default_param0 + sig_default_param1 + sig_default_param2
-- One parameter: either an optionally spread variable or a destructuring
-- pattern, followed by any number of default values and ',' separators.
local func_sig = ws^0 * ((splat^-1 * variable) + (unpack_array_parenth + unpack_object_parenth)) * ws^0 * sig_default_param^0 * ws^0 * colon^0
-- Complete parenthesized parameter list: "(" parameters... ")".
local func_sig_parenthesis = token(l.OPERATOR, S('(')) * func_sig^0 * ws^0 * token(l.OPERATOR, S(')'))
-- Plain and generator form: the `function` keyword, optionally followed by
-- whitespace and a '*'.
-- The '*' group must be optional (^-1). When it was mandatory, only
-- `function*` could match, so the rules built on this pattern never
-- recognized plain `function name()` declarations.
local func_keyword = token(l.KEYWORD, P('function')) * (ws^0 * token(l.OPERATOR, S('*')))^-1
-- '=' or ':' between a name and the function value bound to it.
local binding_op = token(l.OPERATOR, S('=:'))
-- Form: "function XXX()".
local func1 = func_keyword * ws^1 * funcname * ws^0 * func_sig_parenthesis
-- Form: "XXX = function()" or "XXX: function()".
local func2 = funcname * ws^0 * binding_op * ws^0 * func_keyword * ws^0 * func_sig_parenthesis
-- Form used inside ES6 class bodies: "XXX() {".
local func3 = funcname * ws^0 * func_sig_parenthesis * ws^0 * opencurly
-- Accessor form: "get XXX() {" / "set XXX() {".
local get_set = token(l.KEYWORD, P('get') + P('set'))
local func3_get_set = get_set * ws^1 * func3
-- Arrow function head: an empty "()" (colorized as a variable), a single bare
-- parameter, or a full parenthesized parameter list, followed by "=>".
local lambda_params = token(l.VARIABLE, P('()')) + variable + func_sig_parenthesis
local arrow = token(l.OPERATOR, S('=')) * token(l.OPERATOR, S('>'))
local lambda = lambda_params * ws^0 * arrow
-- ES6 form: "XXX = () => ..." or "XXX: () => ...".
local func4 = funcname * ws^0 * binding_op * ws^0 * lambda
-- Match-time function that consumes the JavaScript embedded in a JSX
-- attribute value, i.e. everything between an already consumed '{' and the
-- '}' that balances it.
-- Braces are balanced purely by counting '{' and '}' characters; braces that
-- appear inside string literals or comments of the embedded code are counted
-- as well.
-- input: the subject string being matched.
-- index: the position at which to start scanning (right after the opening
--   '{' when used from `jsx_attr_js`).
-- Returns the position of the balancing '}' so that the brace itself is left
-- for the following pattern, or nil (match failure) when the braces are not
-- balanced within `input`. The unbalanced case used to raise an
-- "arithmetic on a nil value" error.
local in_jsx_attr = P(function(input, index)
  local depth = 1
  local pos = index
  while depth > 0 do
    -- Plain (non-pattern) searches for the next brace of each kind.
    local close_pos = input:find('}', pos, true)
    if not close_pos then
      -- No closing brace is left, so the open braces can never be balanced.
      return nil
    end
    local open_pos = input:find('{', pos, true)
    if open_pos and open_pos < close_pos then
      -- A nested '{' comes first: go one level deeper.
      depth = depth + 1
      pos = open_pos + 1
    else
      depth = depth - 1
      pos = close_pos + 1
    end
  end
  -- `pos` is one past the balancing '}'; step back so the '}' is not consumed.
  return pos - 1
end)
-- Dirk: not sure whether this newline stuff is really needed
-- local newline = (P"\r\n" + P"\n\r" + S"\r\n")
-- Any mix of whitespace and comments that ends with whitespace.
local ws_comment = (ws^0 * comment)^0 * ws
-- Tag name, optionally dotted: "Foo" or "Foo.Bar.Baz".
local jsx_tag = l.word * (P('.') * l.word)^0
-- "{ ...embedded JavaScript... }": the braces are operator tokens, everything
-- between them is a single 'embedded' token.
local jsx_attr_js = opencurly * token('embedded', in_jsx_attr) * closecurly
-- Tokenizing the embedded expression itself does not seem doable in one pass:
--~ local expression = lambda + keyword + identifier + number + simplestring + templstring + operator + ws
--~ local jsx_attr_js = token(l.OPERATOR, S('{')) * expression^1 * token(l.OPERATOR, S('}'))
-- Attribute: a name optionally followed by "= value", where the value is a
-- string or an embedded JavaScript block.
local jsx_attr_value = ws^0 * P('=') * ws^0 * (string + jsx_attr_js)
local jsx_attribute = token(l.LABEL, l.word) * jsx_attr_value^0
-- An attribute preceded by the mandatory whitespace/comment separator.
local jsx_attributes = ws_comment * jsx_attribute
-- Closing tag: "</Foo>".
local jsx_closing_element = token(l.TYPE, P('</') * jsx_tag * P('>'))
-- Between opening and closing tags only the minimal {EMBEDDED_JS} form is
-- understood.
local minmal_body = ws^0 * jsx_attr_js * ws^0 * jsx_closing_element
-- End of an opening tag: "/>" or ">", optionally preceded by
-- whitespace/comments.
local element_outtro = ws_comment^0 * token(l.TYPE, P('/>') + P('>'))
-- End of input: in a pattern product LPeg turns the number -1 into a pattern
-- that matches only when no characters are left.
local EOF = -1
-- Form 1: an opening tag whose final delimiter is optional, e.g.
--   stuff = <Stuff
--     ref={(e) => scroller_ref = e}
--     key={this.state.wcsize}
-- Accepting end of input instead of '>' or '/>' helps when SciTE does not
-- feed all required lines into the lexer.
local jsx_open_tag = token(l.TYPE, P('<') * jsx_tag)
local jsx_tag_end = element_outtro + ws_comment^0 * EOF
local jsx_element_1 = jsx_open_tag * jsx_attributes^0 * jsx_tag_end * minmal_body^-1
-- Form 2: only the trailing part of an opening tag, e.g.
--     ref={(e) => scroller_ref = e}
--     key={this.state.wcsize}
--   />
local jsx_element_2 = jsx_attribute * jsx_attributes^0 * element_outtro
-- ES6 classes: the `class` keyword, at least one whitespace token, then the
-- class name emitted as a CLASS token.
local class_keyword = token(l.KEYWORD, P('class'))
local classname = class_keyword * ws^1 * token(l.CLASS, l.word)
-- Lexer rules as {name, pattern} pairs.
-- NOTE(review): Scintillua is documented to try rules in the order listed, so
-- the ordering below is significant: the multi-token function, lambda, class
-- and JSX rules come before the generic keyword/identifier/operator rules
-- that would otherwise consume their first token.
M._rules = {
{'whitespace', ws},
-- Function declarations and bindings (see func1..func4 above).
{'func1', func1},
{'func2', func2},
{'func3', func3},
{'func3_get_set', func3_get_set},
{'func4', func4},
{'lambda', lambda},
{'class', classname},
-- JSX opening tags, attribute-only continuations and closing tags.
{'jsx_element_1', jsx_element_1},
{'jsx_element_2', jsx_element_2},
{'jsx_closing_element', jsx_closing_element},
-- Generic single-token fallbacks.
{'keyword', keyword},
{'identifier', identifier},
{'comment', comment},
{'number', number},
{'string', simplestring},
{'templstring', templstring},
{'operator', operator},
}
-- Style assignment for the custom 'string_template' token emitted by
-- `templstring`.
M._tokenstyles = {
  string_template = 'string_template',
}

-- Fold points: '{' and '}' in operator tokens, '/*' and '*/' in comment
-- tokens (+1 for the opening symbol, -1 for the closing one), plus whatever
-- `l.fold_line_comments('//')` provides for consecutive line comments.
M._foldsymbols = {
  _patterns = { '[{}]', '/%*', '%*/', '//' },
  [l.OPERATOR] = {
    ['{'] = 1,
    ['}'] = -1,
  },
  [l.COMMENT] = {
    ['/*'] = 1,
    ['*/'] = -1,
    ['//'] = l.fold_line_comments('//'),
  },
}

-- Line-by-line lexing is switched off for this lexer.
M._LEXBYLINE = false

return M
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment