Blog 2020/6/11
Over the past few years, I've written a number of regex-based lexers and recursive-descent parsers, following the technique described in Gary Bernhardt's A Compiler from Scratch (free and well worth your time -- go watch it now!).
I've realized that writing this code is really just a mechanical process. This means we can write a program to write these lexers and parsers for us!
(Yes, I know, this is not a new idea :)
mklexer.py is a program which generates a regex-based lexer in Python from a token definitions file.
The generated lexer outputs tokens in a standardized JSON format.
- Reap the benefits of a declarative approach
- Generate code which is easily understood and easy to hack on
- Reduce time spent on boilerplate
- Lexer performance
The format of the token definitions file is a series of pairs of lines:
- a line naming the token type (in ALLCAPS),
- followed by a line defining the regex which recognizes that type of token.
For example, here is a token definitions file which tokenizes a trivial S-expression syntax:
tokendefs.txt:
OPAREN
\(
CPAREN
\)
NUMBER
-?\d+(\.\d+)?
SYMBOL
[a-z]+
WSPACE
\s+
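Parsing this format is itself mechanical. Here's a minimal sketch of how a definitions file like this could be loaded into (token type, compiled regex) pairs -- the load_tokendefs function below is illustrative, not part of mklexer.py itself:

import re

def load_tokendefs(path):
    # Hypothetical sketch: read the non-blank lines in pairs, where the first
    # line of each pair names the token type and the second is its regex.
    with open(path) as f:
        lines = [line.rstrip("\n") for line in f if line.strip()]
    pairs = []
    for i in range(0, len(lines), 2):
        pairs.append((lines[i], re.compile(lines[i + 1])))
    return pairs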
Here's an example of the sort of input which we expect this lexer to tokenize:
(define fibonacci
(lambda (n)
(if (lessthan n 2)
n
(sum (fibonacci (sum n -1)) (fibonacci (sum n -2))))))
The generated lexer will spit out the tokens in JSON.
The JSON will conform to the following format:
- The top-level JSON structure will be an array (of tokens)
- Each token will be a JSON object (dictionary)
- Each token will contain the following keys:
  - type, with value TOKEN
  - token_type, with a value which is one of the ALLCAPS token types
  - text, with a value which is the text matched by the regex
For example, if we use the above tokendefs.txt
to tokenize the following input:
(define pi 3.14159)
We should get the following JSON output:
[
{
"type": "TOKEN",
"token_type": "OPAREN",
"text": "("
},
{
"type": "TOKEN",
"token_type": "SYMBOL",
"text": "define"
},
{
"type": "TOKEN",
"token_type": "WSPACE",
"text": " "
},
{
"type": "TOKEN",
"token_type": "SYMBOL",
"text": "pi"
},
{
"type": "TOKEN",
"token_type": "WSPACE",
"text": " "
},
{
"type": "TOKEN",
"token_type": "NUMBER",
"text": "3.14159"
},
{
"type": "TOKEN",
"token_type": "CPAREN",
"text": ")"
}
]
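The core of a lexer like this is a single loop: at each position in the input, try each token regex in definition order and emit a token for the first one that matches. The sketch below is a plausible stand-alone approximation of what the generated lexer.py does, under that first-match assumption -- it is not the actual output of mklexer.py:

#!/usr/bin/env python
import json
import re
import sys

# Token definitions inlined in the same order as tokendefs.txt above.
TOKENDEFS = [
    ("OPAREN", re.compile(r"\(")),
    ("CPAREN", re.compile(r"\)")),
    ("NUMBER", re.compile(r"-?\d+(\.\d+)?")),
    ("SYMBOL", re.compile(r"[a-z]+")),
    ("WSPACE", re.compile(r"\s+")),
]

def tokenize(text):
    tokens = []
    pos = 0
    while pos < len(text):
        for token_type, regex in TOKENDEFS:
            m = regex.match(text, pos)
            if m:
                tokens.append({"type": "TOKEN",
                               "token_type": token_type,
                               "text": m.group(0)})
                pos = m.end()
                break
        else:
            raise ValueError("no token matches at offset %d" % pos)
    return tokens

if __name__ == "__main__":
    with open(sys.argv[1]) as f:
        print(json.dumps(tokenize(f.read()), indent=4))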
The generated lexer also supports an array-based "fast" token format.
Each token is an array consisting of two elements:
- A numeric index into the list of token types
- The token text
For our same (define pi 3.14159)
example, the "fast" output looks like this:
$ ./lexer.py input2.txt --fast
[[0, "("], [3, "define"], [4, " "], [3, "pi"], [4, " "], [2, "3.14159"], [1, ")"], [4, "\n"]]
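Producing the "fast" format from the standard format is just a lookup of each token type's index in the definitions list. A small sketch, assuming the same token dicts as above (to_fast is illustrative, not mklexer.py's actual code):

# Token type names in definition order, matching tokendefs.txt above.
TOKEN_TYPES = ["OPAREN", "CPAREN", "NUMBER", "SYMBOL", "WSPACE"]
TYPE_INDEX = {name: i for i, name in enumerate(TOKEN_TYPES)}

def to_fast(tokens):
    # Convert standard-format token dicts into [<type index>, <text>] pairs.
    return [[TYPE_INDEX[t["token_type"]], t["text"]] for t in tokens]

Swapping the repeated key strings for a single integer index keeps the JSON much smaller, which is presumably the point of the "fast" format.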
$ ./mklexer.py --help
mklexer.py: generate a Python lexer based on a token definitions file.
Display help:
mklexer.py -h
mklexer.py --help
Generate a lexer using tokendefs.txt:
mklexer.py tokendefs.txt > lexer.py
chmod +x lexer.py
tokendefs.txt consists of pairs of TOKENTYPE and <regex> lines.
Example tokendefs.txt:
NUMBER
-?\d+(\.\d+)?
SYMBOL
[a-zA-Z_][a-zA-Z0-9_-]*
Use the lexer on input.txt, producing the standard JSON token format:
./lexer.py input.txt | jq .
Two example tokens in standard JSON format:
{'type': 'TOKEN', 'token_type': 'NUMBER', 'text': '3.14159'}
{'type': 'TOKEN', 'token_type': 'SYMBOL', 'text': 'fibonacci'}
Use the lexer on input.txt, producing "fast" array-based JSON tokens:
./lexer.py --fast input.txt | jq .
"fast" tokens are [<token type index>, <matched text>] pairs.
The same example tokens, but in 'fast' JSON format:
[0, '3.14159']
[1, 'fibonacci']