#!/usr/bin/env python |
# mklexer.py: generate a Python lexer based on a token definitions file. |
# See https://gist.github.com/cellularmitosis/1da62db09d41703c5a505d0bac9d9056 |
# Copyright (c) 2020 Jason Pepas |
# Released under the terms of the MIT license. |
# See https://opensource.org/licenses/MIT |
import sys |
import os |
try: |
from StringIO import StringIO |
except ImportError: |
from io import StringIO |
def usage(fd):
    """Write the command-line help text to the given stream.

    When ``fd`` is ``sys.stderr`` the text starts with a "bad usage" error
    banner; for any other stream it starts with a one-line description of
    the program instead.

    Args:
        fd: a writable text stream (only its ``write`` method is used).
    """
    exe = os.path.basename(sys.argv[0])
    w = fd.write
    if fd is sys.stderr:
        w("Error: bad usage.\n")
        w("\n")
    else:
        w("%s: generate a Python lexer based on a token definitions file.\n" % exe)
        w("\n")
    w("Display help:\n")
    w(" %s -h\n" % exe)
    w(" %s --help\n" % exe)
    w("\n")
    w("Generate a lexer using tokendefs.txt:\n")
    w(" %s tokendefs.txt > lexer.py\n" % exe)
    w(" chmod +x lexer.py\n")
    # The example shows TOKENTYPE/regex *pairs* (as the first sentence says)
    # and the sample tokens mirror what the generated lexer really emits:
    # JSON objects whose 'type' is the lowercase string "token".
    w("""
tokendefs.txt consists of pairs of TOKENTYPE and <regex> lines.
Example tokendefs.txt:
NUMBER
-?\\d+(\\.\\d+)?
SYMBOL
[a-zA-Z_][a-zA-Z0-9_-]*
Use the lexer on input.txt, producing the standard JSON token format:
./lexer.py input.txt | jq .
Two example tokens in standard JSON format:
{"type": "token", "token_type": "NUMBER", "text": "3.14159"}
{"type": "token", "token_type": "SYMBOL", "text": "fibonacci"}
Use the lexer on input.txt, producing "fast" array-based JSON tokens:
./lexer.py --fast input.txt | jq .
"fast" tokens are [<token type index>, <matched text>] pairs.
The same example tokens, but in 'fast' JSON format:
[0, "3.14159"]
[1, "fibonacci"]
tokendefs.txt may also contain #pragma's: line-oriented, discard, eof, refine.
""")
def parse_tokendefs(lines):
    """Parse the main section of a token definitions file.

    Parsing stops at the first ``#pragma refine`` line (or at end of input).

    Args:
        lines: the file's lines, without line terminators.

    Returns:
        A ``(pragmas, tokendefs, remaining_lines)`` tuple where
        ``pragmas`` maps a pragma name to ``True`` ('line-oriented', 'eof')
        or to a list of token type names ('discard'), ``tokendefs`` is a
        list of ``(token_type, regex)`` pairs in file order, and
        ``remaining_lines`` starts at the first 'refine' pragma (empty if
        there is none).

    Raises:
        Exception: on a malformed or unknown pragma, a token type with no
            regex line after it, or an empty regex line.
    """
    tokendefs = []
    pragmas = {}
    i = 0
    while i < len(lines):
        line = lines[i]
        words = line.split()
        if not words:
            # Blank *or whitespace-only* line: nothing to parse.  (Indexing
            # words[0] on such a line used to raise IndexError.)
            i += 1
            continue
        if words[0] == '#pragma':
            if len(words) == 1:
                raise Exception("Can't parse pragma: %s" % line)
            pragma_name = words[1]
            if pragma_name == 'refine':
                # The refine sections are parsed separately.
                break
            if pragma_name in ('line-oriented', 'eof'):
                pragmas[pragma_name] = True
            elif pragma_name == 'discard':
                discardable_token_types = words[2:]
                if not discardable_token_types:
                    raise Exception("Discard pragma with no token types listed")
                pragmas[pragma_name] = discardable_token_types
            else:
                raise Exception("Unknown pragma '%s'" % pragma_name)
            i += 1
            continue
        if line[0] == '#':
            # this is just a comment.
            i += 1
            continue
        # A token definition is a TOKENTYPE line followed by a regex line.
        # Surrounding whitespace is dropped from the type name so it can
        # match the whitespace-split names of a 'discard' pragma.
        tokentype = line.strip()
        i += 1
        if i >= len(lines):
            # i is now the 1-based line number of the token type line.
            raise Exception(
                "Line %d: Token type '%s' has no corresponding regex."
                % (i, tokentype)
            )
        # The regex line is taken verbatim: whitespace may be significant.
        regex = lines[i]
        if len(regex) == 0:
            # Parenthesised: '%' binds tighter than '+', so the unparenthesised
            # form raised TypeError instead of reporting the line number.
            raise Exception("Line %d: Zero-length regex." % (i + 1))
        i += 1
        tokendefs.append((tokentype, regex))
    remaining_lines = lines[i:]
    return (pragmas, tokendefs, remaining_lines)
def parse_refine_section(lines):
    """Parse one ``#pragma refine TOKENTYPE`` section.

    Args:
        lines: lines whose first element is the ``#pragma refine`` line that
            opens the section.  Line numbers in error messages are relative
            to this list, not to the whole file.

    Returns:
        A ``(refine_struct, remaining_lines)`` tuple.  ``refine_struct`` is
        ``[refined_token_type, refine_defs]`` with ``refine_defs`` a list of
        ``(token_type, regex)`` pairs; ``remaining_lines`` starts at the
        next 'refine' pragma (empty if there is none).

    Raises:
        Exception: if the section is empty, the pragma names no token type,
            another kind of pragma appears inside the section, a token type
            has no regex line, or a regex line is empty.
    """
    line1 = lines[0]
    if len(lines) == 1:
        raise Exception("Empty refine section: '%s'" % line1)
    header_words = line1.split()
    if len(header_words) < 3:
        # Previously surfaced as a bare IndexError from split()[2].
        raise Exception("Refine pragma with no token type listed: '%s'" % line1)
    refined_token_type = header_words[2]
    refine_defs = []
    i = 1
    while i < len(lines):
        line = lines[i]
        words = line.split()
        if not words:
            # Blank or whitespace-only line.
            i += 1
            continue
        if words[0] == '#pragma':
            if len(words) == 1:
                raise Exception("Can't parse pragma: %s" % line)
            pragma_name = words[1]
            if pragma_name == 'refine':
                # Start of the next refine section: leave it for the caller.
                break
            raise Exception("Pragma not allowed in 'refine' section: '%s'" % pragma_name)
        if line[0] == '#':
            # this is just a comment.
            i += 1
            continue
        tokentype = line.strip()
        i += 1
        if i >= len(lines):
            raise Exception(
                "Line %d: Token type '%s' has no corresponding regex."
                % (i + 1, tokentype)
            )
        # The regex line is taken verbatim: whitespace may be significant.
        regex = lines[i]
        # Validate the *regex* (the original tested the token type, which
        # can never be empty here), with the format argument parenthesised
        # so the message is actually produced.
        if len(regex) == 0:
            raise Exception("Line %d: Zero-length regex." % (i + 1))
        i += 1
        refine_defs.append((tokentype, regex))
    refine_struct = [refined_token_type, refine_defs]
    remaining_lines = lines[i:]
    return (refine_struct, remaining_lines)
def codegen_pragmas(pragmas):
    """Return the Python source that defines the pragma globals.

    Three assignments are emitted -- ``pragma_line_oriented``,
    ``pragma_eof`` and ``pragma_discard`` -- followed by a blank line.
    """
    line_oriented_flag = 'line-oriented' in pragmas
    eof_flag = 'eof' in pragmas
    if 'discard' in pragmas:
        quoted_names = ["'%s'" % name for name in pragmas['discard']]
        discard_literal = '[%s]' % ','.join(quoted_names)
    else:
        discard_literal = '[]'
    return ''.join([
        "pragma_line_oriented = %s\n" % line_oriented_flag,
        "pragma_eof = %s\n" % eof_flag,
        "pragma_discard = %s\n" % discard_literal,
        '\n',
    ])
def codegen_regex(regex_text):
    """Return a Python string literal (as source text) for the given regex.

    Raw-string forms are preferred so the pattern reads the same in the
    generated file as in the tokendefs file; ``repr`` is the last resort.
    """
    # Single-line raw strings: usable when that quote character is absent.
    if "'" not in regex_text:
        return "r'" + regex_text + "'"
    if '"' not in regex_text:
        return 'r"' + regex_text + '"'
    # Triple-quoted raw strings: the delimiter must not occur in the text,
    # and the text must not begin or end with the delimiter's quote char.
    for quote_char in ("'", '"'):
        delimiter = quote_char * 3
        touches_edge = (
            regex_text.startswith(quote_char) or regex_text.endswith(quote_char)
        )
        if delimiter not in regex_text and not touches_edge:
            return 'r' + delimiter + regex_text + delimiter
    # oh well, at least we tried :shrug:
    return repr(regex_text)
def codegen_tokendefs(tokendefs):
    """Return the Python source of the ``tokendefs`` table.

    Each ``(token_type, regex)`` pair becomes one two-element list row.
    """
    rows = [
        " ['%s', %s],\n" % (name, codegen_regex(pattern))
        for name, pattern in tokendefs
    ]
    return "tokendefs = [\n" + ''.join(rows) + "]\n"
def codegen_refinements(refines):
    """Return the Python source of the ``refinements`` table.

    Each entry of ``refines`` is a ``[refined_token_type, defs]`` pair and is
    emitted as a nested list holding one row per ``(token_type, regex)``.
    """
    chunks = ["refinements = [\n"]
    for parent_type, sub_defs in refines:
        chunks.append(" ['%s', [\n" % parent_type)
        for name, pattern in sub_defs:
            chunks.append(" ['%s', %s],\n" % (name, codegen_regex(pattern)))
        chunks.append(" ]],\n")
    chunks.append("]\n")
    return ''.join(chunks)
def codegen_toktypes(pragmas, tokendefs, refines):
    """Return the Python source of the ``toktypes`` list.

    The list holds the token type names of ``tokendefs`` in order, then the
    names defined in every refinement, then 'EOF' when the 'eof' pragma is
    present.
    """
    names = [entry[0] for entry in tokendefs]
    names.extend(
        sub_def[0] for refinement in refines for sub_def in refinement[1]
    )
    if 'eof' in pragmas:
        names.append('EOF')
    return "toktypes = %s\n" % names
def codegen(pragmas, tokendefs, refines):
    """Generate the complete Python source of the lexer.

    The output is a fixed header, the tables produced by codegen_pragmas,
    codegen_tokendefs, codegen_refinements and codegen_toktypes, and a fixed
    runtime that lexes a file and prints the tokens as JSON.

    Args:
        pragmas: pragma dict as returned by parse_tokendefs.
        tokendefs: list of (token_type, regex) pairs.
        refines: list of [refined_token_type, [(token_type, regex), ...]].

    Returns:
        The generated lexer source as a single string.
    """
    header = """#!/usr/bin/env python
# DO NOT EDIT: this lexer was generated by mklexer.py.

import sys
import re
import json

"""
    # NOTE: this is a regular (non-raw) string, so every backslash meant for
    # the generated file is doubled and inner triple quotes are escaped.
    runtime = """

def compile_regexes():
    \"\"\"Compile the regexes, replacing each pattern string in place.\"\"\"
    for pair in tokendefs:
        pair[1] = re.compile(pair[1])
    for refinement in refinements:
        for pair in refinement[1]:
            # A refinement must match the whole token text.  The pattern is
            # grouped so the anchor applies to every alternative: a bare
            # 'if|else' + '$' would only anchor the 'else' branch.
            try:
                pair[1] = re.compile('(?:' + pair[1] + ')$')
            except re.error:
                # Patterns that cannot be nested in a group (for example a
                # leading global inline flag) keep the plain anchored form.
                pair[1] = re.compile(pair[1] + '$')

compile_regexes()


def get_linenum_charnum(text, offset):
    \"\"\"Returns the line number and character number of the offset.\"\"\"
    linenum = 1
    charnum = 1
    i = 0
    while i < offset:
        if text[i] == '\\n':
            linenum += 1
            charnum = 1
        else:
            charnum += 1
        i += 1
    return (linenum, charnum)


def consume_next_token(text, offset, use_fast_format):
    \"\"\"Consumes next token from the given text input.
    Returns a (token, offset) pair.
    Throws if no tokens match.\"\"\"
    for i, pair in enumerate(tokendefs):
        (token_type, regex) = pair
        m = regex.match(text, offset)
        # An empty match makes no progress and would loop forever in lex(),
        # so it is treated the same as no match.
        if m is None or m.end() == offset:
            continue
        matched_text = m.group()
        if use_fast_format:
            token = [i, matched_text]
        else:
            token = {
                'type': 'token',
                'token_type': token_type,
                'text': matched_text,
            }
        new_offset = offset + len(matched_text)
        return (token, new_offset)
    # none of the token types matched
    (linenum, charnum) = get_linenum_charnum(text, offset)
    raise Exception(
        "Can't lex starting at line %d, character %d, context: '%s'"
        % (linenum, charnum, text[offset:offset+32])
    )


def discard_tokens(tokens, use_fast_format):
    \"\"\"Discards any tokens specified by the 'discard' pragma.\"\"\"
    if use_fast_format:
        # Fast tokens carry an index into toktypes, which also covers the
        # refined token types (tokendefs alone does not).
        discard_set = set(
            i for i, token_type in enumerate(toktypes)
            if token_type in pragma_discard
        )
    else:
        discard_set = set(pragma_discard)
    kept_tokens = []
    for token in tokens:
        if use_fast_format:
            toktype = token[0]
        else:
            toktype = token['token_type']
        if toktype not in discard_set:
            kept_tokens.append(token)
    return kept_tokens


def make_lines(tokens, use_fast_format):
    \"\"\"Return a line-oriented array-of-arrays from the given tokens.\"\"\"
    lines = []
    line = []
    for token in tokens:
        if use_fast_format:
            text = token[1]
        else:
            text = token['text']
        if text == '\\n':
            # A token that is exactly a newline ends the current line and
            # is itself dropped.
            lines.append(line)
            line = []
        else:
            line.append(token)
    if len(line) > 0:
        lines.append(line)
    return lines


def refine(tokens, use_fast_format):
    \"\"\"Further refine the lexed tokens (in place).\"\"\"
    for refinement in refinements:
        refineable_token_type = refinement[0]
        pairs = refinement[1]
        for token in tokens:
            # Every pattern is tried; when several match, the one listed
            # last wins.
            if use_fast_format:
                if toktypes[token[0]] != refineable_token_type:
                    continue
                for refined_token_type, regex in pairs:
                    if regex.match(token[1]):
                        token[0] = toktypes.index(refined_token_type)
            else:
                if token['token_type'] != refineable_token_type:
                    continue
                for refined_token_type, regex in pairs:
                    if regex.match(token['text']):
                        token['token_type'] = refined_token_type
    return


def lex(text, use_fast_format):
    \"\"\"Returns a [format, tokens] list for the given text input.\"\"\"
    tokens = []
    offset = 0
    while offset < len(text):
        (token, offset) = consume_next_token(text, offset, use_fast_format)
        tokens.append(token)
    refine(tokens, use_fast_format)
    tokens = discard_tokens(tokens, use_fast_format)
    if pragma_line_oriented:
        tokens = make_lines(tokens, use_fast_format)
    if pragma_eof:
        if use_fast_format:
            # 'EOF' is always the last entry of toktypes.
            eof_token = [len(toktypes)-1, ""]
        else:
            eof_token = {
                "type": "token",
                "token_type": "EOF",
                "text": ""
            }
        if pragma_line_oriented:
            tokens.append([eof_token])
        else:
            tokens.append(eof_token)
    format_dict = {'type': 'format'}
    if use_fast_format:
        if pragma_line_oriented:
            format_dict['format'] = 'fast-lines'
        else:
            format_dict['format'] = 'fast'
        format_dict['token_types'] = toktypes
    else:
        if pragma_line_oriented:
            format_dict['format'] = 'tokens-lines'
        else:
            format_dict['format'] = 'tokens'
    json_obj = [format_dict, tokens]
    return json_obj


if __name__ == '__main__':
    positional = [arg for arg in sys.argv[1:] if not arg.startswith('-')]
    if not positional:
        sys.stderr.write("Error: no input file given.\\n")
        sys.exit(1)
    # the last non-option arg is the input file.
    infile = positional[-1]
    use_fast_format = '--fast' in sys.argv[1:]
    with open(infile, 'r') as fd:
        text = fd.read()
    json_obj = lex(text, use_fast_format)
    output = json.dumps(json_obj)
    if not output.endswith('\\n'):
        output += '\\n'
    sys.stdout.write(output)
"""
    parts = [
        header,
        codegen_pragmas(pragmas),
        codegen_tokendefs(tokendefs) + '\n',
        codegen_refinements(refines) + '\n',
        codegen_toktypes(pragmas, tokendefs, refines),
        runtime,
    ]
    return ''.join(parts)
if __name__ == "__main__":
    # Script entry point: read the tokendefs file named on the command line
    # and write the generated lexer source to stdout.
    if len(sys.argv) < 2:
        usage(sys.stderr)
        sys.exit(1)
    if '-h' in sys.argv or '--help' in sys.argv:
        usage(sys.stdout)
        sys.exit(0)
    # exactly one non-option arg is accepted: the tokendefs file.
    non_option_args = [arg for arg in sys.argv[1:] if not arg.startswith('-')]
    if len(non_option_args) != 1:
        usage(sys.stderr)
        sys.exit(1)
    tokendefs_fpath = non_option_args[0]
    # 'with' closes the file even if reading fails.
    with open(tokendefs_fpath, 'r') as fd:
        tokendefs_lines = fd.read().splitlines()
    (pragmas, tokendefs, remaining_lines) = parse_tokendefs(tokendefs_lines)
    # Whatever follows the main section is a sequence of 'refine' sections;
    # each call consumes one of them.
    refines = []
    while len(remaining_lines) > 0:
        (refine_struct, remaining_lines) = parse_refine_section(remaining_lines)
        refines.append(refine_struct)
    code = codegen(pragmas, tokendefs, refines)
    sys.stdout.write(code)