Skip to content

Instantly share code, notes, and snippets.

@ninjapanzer
Forked from swarley/lexer.ex
Created February 28, 2013 20:04
Show Gist options
  • Save ninjapanzer/5059661 to your computer and use it in GitHub Desktop.
defmodule Lexer do
defmodule PIR do
require LexChar
# Accessor macros for the lexer state tuple {nl, line, file}:
#   nl   - true when the next token would start a new line
#   line - current line number
#   file - name of the file being lexed
# Defined as macros (not functions) so they expand to a bare elem/2 call
# and therefore remain usable in guard position (e.g. `when nl(opts)`).
# NOTE(review): elem/2 requires a tuple, so every caller must pass the
# state as a tuple, never a list.
defmacrop line(opts) do
quote do: elem(unquote(opts),1)
end
# File-name component (index 2 of the state tuple).
defmacrop file(opts) do
quote do: elem(unquote(opts),2)
end
# New-line flag component (index 0 of the state tuple).
defmacrop nl(opts) do
quote do: elem(unquote(opts),0)
end
# Tokenize `str` (a binary or a char list), starting on line 1 of
# "nofile". Returns the tokens in source order (the private clauses
# accumulate them in reverse).
def tokenize(str) do
  # Rebind OUTSIDE the conditional: the original rebound `str` inside the
  # `if` body, and bindings made inside `if` do not leak out on any
  # modern Elixir, so the binary->list conversion was silently dropped.
  str = if is_binary(str), do: binary_to_list(str), else: str
  Enum.reverse tokenize(str, [], {true, 1, "nofile"})
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize with all given options that are available. |
#| @param str [binary,list] The string to tokenize. |
#| @param nl [true,false] Whether the string is the beginning of a new line. |
#| @param ln [integer] The line number of the string. |
#| @param nm [string] The name of the file that the input is from. |
#*-------------------------------------------------------------------------------------------/
# Tokenize `str` with explicit state: `nl` (start-of-line flag), `ln`
# (starting line number) and `nm` (file name). Returns tokens in source
# order.
def tokenize(str, nl, ln, nm) do
  # The state must be a TUPLE: the line/file/nl accessor macros expand to
  # elem/2, which raises on lists. The original passed [nl, ln, nm],
  # which blew up on the first token that read the state.
  Enum.reverse tokenize(str, [], {nl, ln, nm})
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a hexadecimal integer |
#*-------------------------------------------------------------------------------------------/
# 0x / 0X prefix followed by at least one hex digit: consume the run of
# hex digits and emit an :int token, clearing the new-line flag.
defp tokenize([?0, x, head | tail], tokens, opts) when (x == ?x or x == ?X) and LexChar.is_hex(head) do
{text, hex_num} = tokenize_int([head|tail], fn(c) -> LexChar.is_hex(c) end, 16, [])
tokenize(text, [{:int, line(opts), hex_num} | tokens], {false, line(opts), file(opts)})
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a prefixed hexadecimal integer |
#*-------------------------------------------------------------------------------------------/
# Signed 0x / 0X hexadecimal integer: lex the digit run, then apply the
# sign carried by the leading '+' or '-'.
defp tokenize([p, ?0, x, head | tail], tokens, opts) when (p == ?+ or p == ?-) and (x == ?x or x == ?X) and LexChar.is_hex(head) do
  {rest, magnitude} = tokenize_int([head | tail], fn(c) -> LexChar.is_hex(c) end, 16, [])
  sign = if p == ?-, do: -1, else: 1
  tokenize(rest, [{:int, line(opts), sign * magnitude} | tokens], {false, line(opts), file(opts)})
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a binary integer |
#*-------------------------------------------------------------------------------------------/
# 0b / 0B prefix followed by at least one binary digit: consume the run
# of 0/1 digits and emit an :int token, clearing the new-line flag.
defp tokenize([?0, b, head | tail], tokens, opts) when (b == ?b or b == ?B) and LexChar.is_bin(head) do
{text, bin_num} = tokenize_int([head|tail], fn(c) -> LexChar.is_bin(c) end, 2, [])
tokenize(text, [{:int, line(opts), bin_num} | tokens], {false, line(opts), file(opts)})
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize prefixed binary integer |
#*-------------------------------------------------------------------------------------------/
# Signed 0b / 0B binary integer: lex the digit run, then apply the sign
# carried by the leading '+' or '-'.
defp tokenize([p, ?0, b, head | tail], tokens, opts) when (p == ?+ or p == ?-) and (b == ?b or b == ?B) and LexChar.is_bin(head) do
  {rest, magnitude} = tokenize_int([head | tail], fn(c) -> LexChar.is_bin(c) end, 2, [])
  sign = if p == ?-, do: -1, else: 1
  tokenize(rest, [{:int, line(opts), sign * magnitude} | tokens], {false, line(opts), file(opts)})
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a number |
#*-------------------------------------------------------------------------------------------/
# Decimal number: lex the integer part first; if the remaining input
# starts with '.' followed by another digit, continue lexing the
# fractional part and emit a :float token, otherwise emit an :int.
defp tokenize([head|tail], tokens, opts) when LexChar.is_digit(head) do
{text, int} = tokenize_int([head|tail], fn(c) -> LexChar.is_digit(c) end, 10, [])
if Enum.first(text) == ?. and LexChar.is_digit(Enum.at!(text,1)) do
# Drop only the '.'; tokenize_float folds the fractional digits onto
# the already-parsed integer part.
{text, float} = tokenize_float(Enum.drop(text, 1), int, [])
tokenize(text, [{:float, line(opts), float} | tokens], {false, line(opts), file(opts)})
else
tokenize(text, [{:int, line(opts), int} | tokens], {false, line(opts), file(opts)})
end
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a newline |
#*-------------------------------------------------------------------------------------------/
# A newline produces its own :newline token, then sets the new-line flag
# and bumps the line counter for everything that follows.
defp tokenize([?\n | rest], tokens, opts) do
  next_state = {true, line(opts) + 1, file(opts)}
  tokenize(rest, [{:newline, line(opts), nil} | tokens], next_state)
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a single quote string |
#*-------------------------------------------------------------------------------------------/
# Single-quoted string: hand off to the dedicated scanner, which returns
# the unconsumed input and the unescaped contents.
defp tokenize([?' | rest], tokens, opts) do
  {remaining, contents} = tokenize_single_string(rest, [])
  tokenize(remaining, [{:str, line(opts), contents} | tokens], {false, line(opts), file(opts)})
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a double quote string |
#*-------------------------------------------------------------------------------------------/
# Double-quoted string: the scanner handles the richer escape set and
# returns the unconsumed input plus the contents as a binary.
defp tokenize([?" | rest], tokens, opts) do
  {remaining, contents} = tokenize_double_string(rest, [])
  tokenize(remaining, [{:str, line(opts), contents} | tokens], {false, line(opts), file(opts)})
end
# Register
# Register token: $S, $N, $P or $I followed by a decimal register
# number, e.g. $S0 -> {:reg, line, {:S, 0}}.
defp tokenize([?$, t | tail], tokens, opts) when t == ?S or t == ?N or t == ?P or t == ?I do
  {text, id} = tokenize_int(tail, fn(c) -> LexChar.is_digit(c) end, 10, [])
  if !is_integer(id) do
    {:error, "Invalid register name"}
  else
    # list_to_atom/1 takes a char LIST; the original passed the bare
    # character code `t`, which raised :badarg for every register.
    tokenize(text, [{:reg, line(opts), {list_to_atom([t]), id}} | tokens], {false, line(opts), file(opts)})
  end
end
# Heredoc
#defp tokenize([?<, ?<, ?" | tail], tokens, opts) do
#{text, delimiter} = tokenize_double_string(tail, [])
#{text, lines, string} = tokenize_heredoc(delimiter, 0)
#tokenize(text, [{:str, line(opts), string}|tokens], {true, line(opts) + lines, file(opts)})
#end
# Heredoc
# Add a queue so that it isn't handled until the newline is met?
# Maybe create a function that inserts it into the place of the token
# and create a placeholder within the token array so that it isn't displaced.
#defp tokenize([?<, ?<, ?' | tail], tokens, opts) do
#{text, delimiter} = tokenize_single_string(tail, [])
#{text, lines, string} = tokenize_heredoc(delimiter,0)
#tokenize(text, [{:str, line(opts), string}|tokens], {true, line(opts) + lines, file(opts)})
#end
# Standard comment
# A '#' comment runs to the end of the line. Skip it without emitting a
# token; the trailing newline (if any) is handled by the newline clause.
defp tokenize([?# | rest], tokens, opts) do
  remaining = :lists.dropwhile(fn c -> c != ?\n end, rest)
  tokenize(remaining, tokens, opts)
end
#defp tokenize([?=,?b,?e,?g,?i,?n|tail], tokens, opts) when nl(opts) do
# POD
#end
# Single-character operator / punctuation tokens, mapped via the case
# table below. Unknown characters produce a syntax error.
# NOTE(review): there is no whitespace-skipping clause visible in this
# file, so a plain space also falls through to `_ -> nil` and errors —
# confirm whether that is intended.
defp tokenize([single_char | tail], tokens, opt) do
name = case single_char do
?. -> :dot
?, -> :comma
?; -> :semicolon
?< -> :lt
?> -> :gt
?= -> :assign
?- -> :minus
?+ -> :plus
?% -> :mod
?/ -> :divide
?* -> :multiply
?~ -> :bitnot
?! -> :not
?| -> :bitor
?& -> :bitand
?( -> :lparen
?) -> :rparen
?[ -> :lbracket
?] -> :rbracket
?{ -> :lbrace
?} -> :rbrace
_ -> nil
end
if name == nil do
{:error, "Syntax error, unexpected character"}
else
# Operators may not open a line; elsewhere they clear the new-line
# flag and keep the current line number.
if nl(opt) do
{:error, "Bare Symbols cannot be the start of a line"}
else
tokenize(tail, [{name, line(opt), nil}|tokens], {false, line(opt), file(opt)})
end
end
end
# Terminal clause: end of input (or any shape no other clause matched)
# returns the accumulated tokens. They are accumulated newest-first; the
# public tokenize entry points apply Enum.reverse to restore source order.
defp tokenize(_, tokens, _) do
tokens
end
#*-------------------------------------------------------------------------------------------\
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
#| Support type creation and extension of matching functions |
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
#*-------------------------------------------------------------------------------------------/
#*-------------------------------------------------------------------------------------------\
#| Tokenize a floating point number
#| Formats:
#| -> float ::= [:digit:]+ '.' [:digit:]+
#*-------------------------------------------------------------------------------------------/
# Accumulate fractional digits (in reverse) after the decimal point.
defp tokenize_float([head|tail], int, acc) when LexChar.is_digit(head) do
tokenize_float(tail, int, [head | acc])
end
# First non-digit: combine the integer part with the accumulated
# fraction, scaled by 10^-(number of fractional digits).
# NOTE(review): an empty acc would make list_to_integer raise; the
# number clause guards against this by requiring a digit after '.'
# before dispatching here.
defp tokenize_float(str, int, acc) do
{ str, int + (:math.pow(10, -length(acc)) * :erlang.list_to_integer(Enum.reverse acc)) }
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize an integer
#| @param verify [fn] The function that verifies the character
#| @param base [integer] The base of the integer.
#| @param acc [list] The accumulative list of characters in the integer
#| Formats:
#| -> int ::= [:digit:]+
#| -> hex_int ::= '0' {'x', 'X'} [:xdigit:]+
#| -> bin_int ::= '0' {'b', 'B'} {'0', '1'}+
#| @note This function does not accept these formats with the prefixes, it only accepts
#| the suffix with the base supplied.
#| @example tokenize_int("DEADBEEF", fn(c) do
#| (c >= ?0 and c <= ?9) or
#| ((c >= ?a and c <= ?z) or
#| (c >= ?A and c <= ?Z))
#| end, 16, [])
#| #=> {[], 3735928559}
#*-------------------------------------------------------------------------------------------/
# Consume characters accepted by `verify` (accumulating in reverse) and
# convert the run with the supplied base. Returns {remaining_input, int}.
defp tokenize_int([c | rest] = input, verify, base, acc) do
  cond do
    # Still inside the number: keep accumulating.
    verify.(c) -> tokenize_int(rest, verify, base, [c | acc])
    # First rejected character: convert and hand back the remainder.
    true -> {input, :erlang.list_to_integer(Enum.reverse(acc), base)}
  end
end
# Input exhausted: convert whatever was accumulated.
defp tokenize_int([], _, base, acc) do
  {[], :erlang.list_to_integer(Enum.reverse(acc), base)}
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a single quoted string
#| Single quoted strings only support escaping "\" and "'"
#*-------------------------------------------------------------------------------------------/
# Scan the body of a single-quoted string. Only \\ and \' escapes are
# honored; any other escape yields an {:error, message} tuple.
defp tokenize_single_string([?\\, x | tail], acc) do
  case x do
    ?\\ -> tokenize_single_string(tail, [?\\ | acc])
    ?' -> tokenize_single_string(tail, [?' | acc])
    # The original built the message with `"..." ++ x`, which raises
    # ArgumentError (++ is list concatenation and x is a bare char code);
    # append the offending character as a one-byte binary instead.
    _ -> {:error, "Unknown character escape \\" <> <<x>>}
  end
end
# Closing quote: return {remaining_input, contents_as_char_list}.
defp tokenize_single_string([?' | tail ], acc) do
  {tail, Enum.reverse(acc)}
end
# Ordinary character: accumulate (in reverse) and continue.
defp tokenize_single_string([head|tail], acc) do
  tokenize_single_string(tail, [head | acc])
end
# Ran out of input before the closing quote.
# NOTE(review): callers destructure the result as {text, str}, so this
# error tuple (like the escape error above) surfaces as a MatchError at
# the call site rather than being reported cleanly.
defp tokenize_single_string([], _) do
  {:error, "Unterminated string"}
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a double quoted string
#| Escape Sequences:
#| \a -> Bell (Alert)
#| \b -> Backspace
#| \t -> Tab
#| \n -> Newline
#| \v -> Vertical Tab
#| \f -> Form Feed
#| \r -> Return Carriage
#| \e -> Escape
#| \\ -> Backslash
#| \" -> Double Quote
#| \xHH -> Hex Character (1..2) Hex Digits)
#| \OOO -> Octal Character (1..3) Octal Digits)
#| \cX -> Control Character
#| \x{h..h} -> Hex Character (1..8 Hex Digits)
#| \uhhhh -> Unicode Character (4 Hex Digits)
#| \Uhhhhhhhh -> Unicode Character (8 Hex Digits)
#*-------------------------------------------------------------------------------------------/
# Scan one escape sequence in a double-quoted string (the backslash has
# already been matched). Simple C-style escapes translate directly; \x,
# \u and \U delegate to the hex/unicode scanners; anything else is
# treated as the start of an octal escape.
# NOTE(review): on a bad \x/\u/\U/octal escape the helpers return
# {:error, msg}, so `hex`/`uni`/`oct` binds to :error and only the bare
# :error atom is returned here — the message is lost. Left as-is since
# callers only destructure {text, str}.
defp tokenize_double_string([?\\, x | tail], acc) do
  case x do
    ?a -> tokenize_double_string(tail, [?\a | acc])
    ?b -> tokenize_double_string(tail, [?\b | acc])
    ?t -> tokenize_double_string(tail, [?\t | acc])
    ?n -> tokenize_double_string(tail, [?\n | acc])
    ?v -> tokenize_double_string(tail, [?\v | acc])
    ?f -> tokenize_double_string(tail, [?\f | acc])
    ?r -> tokenize_double_string(tail, [?\r | acc])
    ?e -> tokenize_double_string(tail, [?\e | acc])
    ?\\ -> tokenize_double_string(tail, [?\\ | acc])
    ?" -> tokenize_double_string(tail, [?" | acc])
    ?x ->
      { hex, text } = hex_e(tail)
      if !is_integer(hex) do
        hex
      else
        tokenize_double_string(text, [hex | acc])
      end
    ?u ->
      { uni, text } = uni_e(tail)
      if !is_integer(uni) do
        uni
      else
        tokenize_double_string(text, [uni | acc])
      end
    ?U ->
      { uni, text } = big_uni_e(tail)
      if !is_integer(uni) do
        uni
      else
        tokenize_double_string(text, [uni | acc])
      end
    _ ->
      # Octal escape \OOO: `x` is the FIRST octal digit, so it must be
      # pushed back onto the input. The original called oct_e(tail),
      # dropping it and mis-reading e.g. \101 as \01.
      {oct, text} = oct_e([x | tail])
      if !is_integer(oct) do
        oct
      else
        tokenize_double_string(text, [oct | acc])
      end
    # ?c -> control-character escape listed in the header comment but
    #       not implemented.
  end
end
# Closing quote: package the accumulated characters as a binary and
# return {remaining_input, string}.
defp tokenize_double_string([?" | tail], acc) do
{tail, list_to_binary(Enum.reverse(acc))}
end
# Ordinary character: accumulate (in reverse) and continue.
defp tokenize_double_string([head | tail], acc) do
tokenize_double_string(tail, [head | acc])
end
# Input ended before the closing quote.
defp tokenize_double_string([], _) do
{:error, "Unterminated string"}
end
#*-------------------------------------------------------------------------------------------\
#| Check for a valid hexadecimal character escape
#*-------------------------------------------------------------------------------------------/
# Hex escape scanner. Accepts the braced form \x{h..h} with 1-8 hex
# digits and the bare form \xH / \xHH with 1-2 hex digits. Returns
# {codepoint, remaining_input} or {:error, message}.
# Clause order is longest-match-first, so e.g. {AB} is read as one
# two-digit braced escape rather than falling through to the bare form.
defp hex_e([?{,a,b,c,d,e,f,g,h,?}|tail]) when LexChar.hex8(a,b,c,d,e,f,g,h) do
{:erlang.list_to_integer([a,b,c,d,e,f,g,h],16), tail}
end
defp hex_e([?{,a,b,c,d,e,f,g,?}|tail]) when LexChar.hex7(a,b,c,d,e,f,g) do
{:erlang.list_to_integer([a,b,c,d,e,f,g],16), tail}
end
defp hex_e([?{,a,b,c,d,e,f,?}|tail]) when LexChar.hex6(a,b,c,d,e,f) do
{:erlang.list_to_integer([a,b,c,d,e,f],16), tail}
end
defp hex_e([?{,a,b,c,d,e,?}|tail]) when LexChar.hex5(a,b,c,d,e) do
{:erlang.list_to_integer([a,b,c,d,e],16), tail}
end
defp hex_e([?{,a,b,c,d,?}|tail]) when LexChar.hex4(a,b,c,d) do
{:erlang.list_to_integer([a,b,c,d],16), tail}
end
defp hex_e([?{,a,b,c,?}|tail]) when LexChar.hex3(a,b,c) do
{:erlang.list_to_integer([a,b,c],16), tail}
end
defp hex_e([?{,a,b,?}|tail]) when LexChar.hex2(a,b) do
{:erlang.list_to_integer([a,b], 16), tail}
end
defp hex_e([?{,a,?}|tail]) when LexChar.is_hex(a) do
{:erlang.list_to_integer([a], 16), tail}
end
# Bare (unbraced) forms: two hex digits, then one.
defp hex_e([a,b|tail]) when LexChar.hex2(a,b) do
{:erlang.list_to_integer([a,b], 16), tail}
end
defp hex_e([a|tail]) when LexChar.is_hex(a) do
{:erlang.list_to_integer([a], 16), tail}
end
# No valid digit sequence after \x.
defp hex_e(_) do
{:error, "Invalid hex escape"}
end
#*-------------------------------------------------------------------------------------------\
#| Check for a valid octal character escape
#*-------------------------------------------------------------------------------------------/
# Octal escape scanner: 1-3 octal digits, longest match first.
# Returns {codepoint, remaining_input} or {:error, message}.
defp oct_e([a,b,c|tail]) when LexChar.is_octal(a) and LexChar.is_octal(b) and LexChar.is_octal(c) do
{:erlang.list_to_integer([a,b,c],8), tail}
end
defp oct_e([a,b|tail]) when LexChar.is_octal(a) and LexChar.is_octal(b) do
{:erlang.list_to_integer([a,b],8), tail}
end
defp oct_e([a|tail]) when LexChar.is_octal(a) do
{:erlang.list_to_integer([a],8), tail}
end
# Not an octal digit at all.
defp oct_e(_), do: {:error, "Invalid octal escape"}
#*-------------------------------------------------------------------------------------------\
#| Check for a valid unicode character escape
#*-------------------------------------------------------------------------------------------/
# \uhhhh unicode escape: exactly 4 hex digits.
# Returns {codepoint, remaining_input} or {:error, message}.
defp uni_e([a,b,c,d|tail]) when LexChar.hex4(a,b,c,d) do
{:erlang.list_to_integer([a,b,c,d], 16), tail}
end
defp uni_e(_), do: {:error, "Invalid unicode escape sequence"}
#*-------------------------------------------------------------------------------------------\
#| Check for a valid long unicode character escape
#*-------------------------------------------------------------------------------------------/
# \Uhhhhhhhh unicode escape: exactly 8 hex digits.
# Returns {codepoint, remaining_input} or {:error, message}.
defp big_uni_e([a,b,c,d,e,f,g,h|tail]) when LexChar.hex8(a,b,c,d,e,f,g,h) do
{:erlang.list_to_integer([a,b,c,d,e,f,g,h], 16), tail}
end
defp big_uni_e(_), do: {:error, "Invalid unicode escape sequence"}
end
defmodule PASM do
# Empty placeholder — presumably intended for a PASM lexer alongside
# Lexer.PIR above; nothing is implemented here yet.
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment