Gist ninjapanzer/5059661 — a lexer for PIR (Parrot Intermediate Representation) written in early-era Elixir.
Note: the original gist page warned that the file may contain bidirectional Unicode text that could render differently than it compiles; review it in an editor that reveals hidden Unicode characters.
defmodule Lexer do
  defmodule PIR do
    require LexChar

    # Every tokenizer clause threads a 3-tuple of lexer state:
    #   {newline?, line_number, file_name}
    # The private macros below give the tuple elements readable names.

    # Current line number (element 1 of the state tuple).
    defmacrop line(opts) do
      quote do
        elem(unquote(opts), 1)
      end
    end

    # Name of the file being tokenized (element 2 of the state tuple).
    defmacrop file(opts) do
      quote do
        elem(unquote(opts), 2)
      end
    end

    # Whether we are at the start of a line (element 0 of the state tuple).
    defmacrop nl(opts) do
      quote do
        elem(unquote(opts), 0)
      end
    end
# Tokenize `str` (a binary or a charlist), starting at line 1 of "nofile".
# Returns the token list in source order, or an {:error, reason} tuple
# bubbled up from one of the tokenizer clauses.
def tokenize(str) do
  # BUG FIX: the original rebound `str` inside an `if` block; that rebinding
  # does not leak out of the block in modern Elixir, so binaries were passed
  # through unconverted. Do the conversion as an expression instead.
  chars = if is_binary(str), do: :erlang.binary_to_list(str), else: str
  # BUG FIX: tokenize/3 can return {:error, reason}; reversing a tuple with
  # Enum.reverse/1 raised a Protocol error. Propagate the error untouched.
  case tokenize(chars, [], {true, 1, "nofile"}) do
    {:error, _} = err -> err
    tokens -> Enum.reverse(tokens)
  end
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize with all given options that are available.                                        |
#| @param str [binary,list] The string to tokenize.                                           |
#| @param nl  [true,false]  Whether the string is the beginning of a new line.                |
#| @param ln  [integer]     The line number of the string.                                    |
#| @param nm  [string]      The name of the file that the input is from.                      |
#*-------------------------------------------------------------------------------------------/
def tokenize(str, nl, ln, nm) do
  # BUG FIX: the options were passed as a LIST [nl, ln, nm], but the
  # line/file/nl accessor macros use elem/2, which only works on tuples,
  # so every clause reached through this entry point crashed. Pass a
  # tuple, exactly as tokenize/1 does.
  chars = if is_binary(str), do: :erlang.binary_to_list(str), else: str
  case tokenize(chars, [], {nl, ln, nm}) do
    {:error, _} = err -> err
    tokens -> Enum.reverse(tokens)
  end
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a hexadecimal integer ("0x..." / "0X...").                                        |
#*-------------------------------------------------------------------------------------------/
defp tokenize([?0, x, first | rest], tokens, opts) when (x == ?x or x == ?X) and LexChar.is_hex(first) do
  {remaining, value} = tokenize_int([first | rest], fn c -> LexChar.is_hex(c) end, 16, [])
  tokenize(remaining, [{:int, line(opts), value} | tokens], {false, line(opts), file(opts)})
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a sign-prefixed hexadecimal integer ("+0x..." / "-0x...").                        |
#*-------------------------------------------------------------------------------------------/
defp tokenize([sign, ?0, x, first | rest], tokens, opts) when (sign == ?+ or sign == ?-) and (x == ?x or x == ?X) and LexChar.is_hex(first) do
  {remaining, value} = tokenize_int([first | rest], fn c -> LexChar.is_hex(c) end, 16, [])
  # "+1" -> 1, "-1" -> -1: turns the sign character into a multiplier.
  sign_mul = :erlang.list_to_integer([sign, ?1])
  tokenize(remaining, [{:int, line(opts), sign_mul * value} | tokens], {false, line(opts), file(opts)})
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a binary integer ("0b..." / "0B...").                                             |
#*-------------------------------------------------------------------------------------------/
defp tokenize([?0, b, first | rest], tokens, opts) when (b == ?b or b == ?B) and LexChar.is_bin(first) do
  {remaining, value} = tokenize_int([first | rest], fn c -> LexChar.is_bin(c) end, 2, [])
  tokenize(remaining, [{:int, line(opts), value} | tokens], {false, line(opts), file(opts)})
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a sign-prefixed binary integer ("+0b..." / "-0b...").                             |
#*-------------------------------------------------------------------------------------------/
defp tokenize([sign, ?0, b, first | rest], tokens, opts) when (sign == ?+ or sign == ?-) and (b == ?b or b == ?B) and LexChar.is_bin(first) do
  {remaining, value} = tokenize_int([first | rest], fn c -> LexChar.is_bin(c) end, 2, [])
  # "+1" -> 1, "-1" -> -1: turns the sign character into a multiplier.
  sign_mul = :erlang.list_to_integer([sign, ?1])
  tokenize(remaining, [{:int, line(opts), sign_mul * value} | tokens], {false, line(opts), file(opts)})
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a decimal number: an integer, or a float when the digits are                      |
#| immediately followed by '.' and another digit.                                             |
#*-------------------------------------------------------------------------------------------/
defp tokenize([head | tail], tokens, opts) when LexChar.is_digit(head) do
  {text, int} = tokenize_int([head | tail], fn c -> LexChar.is_digit(c) end, 10, [])
  # BUG FIX: the original used Enum.first/Enum.at!, and Enum.at! raised when
  # the integer was followed by a bare '.' at end of input (e.g. "5.").
  # Pattern matching handles that safely; a trailing '.' is left in the
  # input for the symbol clause to consume.
  case text do
    [?., next | _] when LexChar.is_digit(next) ->
      {rest, float} = tokenize_float(tl(text), int, [])
      tokenize(rest, [{:float, line(opts), float} | tokens], {false, line(opts), file(opts)})
    _ ->
      tokenize(text, [{:int, line(opts), int} | tokens], {false, line(opts), file(opts)})
  end
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a newline: emit a :newline token, advance the line counter, and                   |
#| mark the state as being at the start of a line.                                            |
#*-------------------------------------------------------------------------------------------/
defp tokenize([?\n | rest], tokens, opts) do
  token = {:newline, line(opts), nil}
  tokenize(rest, [token | tokens], {true, line(opts) + 1, file(opts)})
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a single quoted string.                                                           |
#*-------------------------------------------------------------------------------------------/
defp tokenize([?' | tail], tokens, opts) do
  # BUG FIX: on an unterminated string the helper returns {:error, msg};
  # the original destructured it as {text, str}, emitted a bogus token, and
  # silently stopped. Propagate the error tuple instead.
  case tokenize_single_string(tail, []) do
    {:error, _} = err -> err
    {text, str} -> tokenize(text, [{:str, line(opts), str} | tokens], {false, line(opts), file(opts)})
  end
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a double quoted string.                                                           |
#*-------------------------------------------------------------------------------------------/
defp tokenize([?" | tail], tokens, opts) do
  case tokenize_double_string(tail, []) do
    {:error, _} = err -> err
    {text, str} -> tokenize(text, [{:str, line(opts), str} | tokens], {false, line(opts), file(opts)})
  end
end
# Register: '$' followed by S, N, P or I and one or more decimal digits,
# e.g. "$S0" -> {:reg, line, {:S, 0}}.
defp tokenize([?$, t, d | tail], tokens, opts) when (t == ?S or t == ?N or t == ?P or t == ?I) and LexChar.is_digit(d) do
  {text, id} = tokenize_int([d | tail], fn c -> LexChar.is_digit(c) end, 10, [])
  # BUG FIX: list_to_atom/1 takes a charlist, so the register letter must be
  # wrapped in a list ([?S] -> :S); the original passed the bare integer,
  # which is a badarg.
  tokenize(text, [{:reg, line(opts), {list_to_atom([t]), id}} | tokens], {false, line(opts), file(opts)})
end
# A register sigil with no digit after it is an error. (The original checked
# `!is_integer(id)` after the fact, but tokenize_int raised on empty input
# before that check could ever fire.)
defp tokenize([?$, t | _tail], _tokens, _opts) when t == ?S or t == ?N or t == ?P or t == ?I do
  {:error, "Invalid register name"}
end
# Heredoc (<<"DELIM") — not yet implemented.
#defp tokenize([?<, ?<, ?" | tail], tokens, opts) do
#  {text, delimiter} = tokenize_double_string(tail, [])
#  {text, lines, string} = tokenize_heredoc(delimiter, 0)
#  tokenize(text, [{:str, line(opts), string} | tokens], {true, line(opts) + lines, file(opts)})
#end
# Heredoc (<<'DELIM') — not yet implemented.
# Add a queue so that it isn't handled until the newline is met?
# Maybe create a function that inserts it into the place of the token
# and create a placeholder within the token array so that it isn't displaced.
#defp tokenize([?<, ?<, ?' | tail], tokens, opts) do
#  {text, delimiter} = tokenize_single_string(tail, [])
#  {text, lines, string} = tokenize_heredoc(delimiter, 0)
#  tokenize(text, [{:str, line(opts), string} | tokens], {true, line(opts) + lines, file(opts)})
#end
# Standard comment: skip everything up to (but not including) the newline, so
# the newline clause still fires and bumps the line counter.
defp tokenize([?# | rest], tokens, opts) do
  tokenize(Enum.drop_while(rest, fn c -> c != ?\n end), tokens, opts)
end
#defp tokenize([?=,?b,?e,?g,?i,?n|tail], tokens, opts) when nl(opts) do
#  POD — not yet implemented.
#end
# Single-character symbol tokens (punctuation and operators).
defp tokenize([char | rest], tokens, opts) do
  name = symbol_name(char)
  cond do
    name == nil ->
      {:error, "Syntax error, unexpected character"}
    nl(opts) ->
      {:error, "Bare Symbols cannot be the start of a line"}
    true ->
      tokenize(rest, [{name, line(opts), nil} | tokens], {false, line(opts), file(opts)})
  end
end
# Anything else (including exhausted input) ends tokenization; the
# accumulated tokens are returned as-is (still reversed).
defp tokenize(_, tokens, _) do
  tokens
end
# Map a punctuation character to its token name, or nil when unrecognized.
defp symbol_name(char) do
  case char do
    ?. -> :dot
    ?, -> :comma
    ?; -> :semicolon
    ?< -> :lt
    ?> -> :gt
    ?= -> :assign
    ?- -> :minus
    ?+ -> :plus
    ?% -> :mod
    ?/ -> :divide
    ?* -> :multiply
    ?~ -> :bitnot
    ?! -> :not
    ?| -> :bitor
    ?& -> :bitand
    ?( -> :lparen
    ?) -> :rparen
    ?[ -> :lbracket
    ?] -> :rbracket
    ?{ -> :lbrace
    ?} -> :rbrace
    _ -> nil
  end
end
#*-------------------------------------------------------------------------------------------\
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
#| Support type creation and extension of matching functions                                  |
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
#*-------------------------------------------------------------------------------------------/
#*-------------------------------------------------------------------------------------------\
#| Tokenize the fractional part of a floating point number. `whole` is the                    |
#| integer part already parsed; `digits` accumulates the fraction's digit                     |
#| characters in reverse. Returns {remaining_input, float_value}.                             |
#| Formats:                                                                                   |
#|   -> float ::= [:digit:]+ '.' [:digit]+                                                    |
#*-------------------------------------------------------------------------------------------/
defp tokenize_float([d | rest], whole, digits) when LexChar.is_digit(d) do
  tokenize_float(rest, whole, [d | digits])
end
defp tokenize_float(rest, whole, digits) do
  fraction = :erlang.list_to_integer(Enum.reverse(digits))
  # Scale the fraction by 10^-(digit count), e.g. "05" -> 5 * 10^-2 = 0.05.
  {rest, whole + :math.pow(10, -length(digits)) * fraction}
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize an integer. Returns {remaining_input, integer_value}.                             |
#| @param verify [fn]      Predicate accepting a character that belongs to the integer.       |
#| @param base   [integer] The base used to convert the accumulated digits.                   |
#| @param acc    [list]    Digit characters accumulated so far (in reverse).                  |
#| Formats:                                                                                   |
#|   -> int     ::= [:digit:]+                                                                |
#|   -> hex_int ::= '0' {'x', 'X'} [:xdigit:]+                                                |
#|   -> bin_int ::= '0' {'b', 'B'} {'0', '1'}+                                                |
#| @note This function does not accept these formats with the prefixes; it only               |
#|       consumes digit characters in the base supplied by the caller.                        |
#| @example tokenize_int('DEADBEEF', fn(c) ->                                                 |
#|            (c >= ?0 and c <= ?9) or (c >= ?a and c <= ?z) or (c >= ?A and c <= ?Z)         |
#|          end, 16, [])                                                                      |
#|          #=> {[], 3735928559}                                                              |
#*-------------------------------------------------------------------------------------------/
defp tokenize_int([c | rest] = input, verify, base, acc) do
  if verify.(c) do
    tokenize_int(rest, verify, base, [c | acc])
  else
    # First character the predicate rejects ends the number.
    {input, :erlang.list_to_integer(Enum.reverse(acc), base)}
  end
end
defp tokenize_int([], _verify, base, acc) do
  {[], :erlang.list_to_integer(Enum.reverse(acc), base)}
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a single quoted string.                                                           |
#| Single quoted strings only support escaping "\" and "'".                                   |
#| Returns {remaining_input, string_chars} or {:error, reason}.                               |
#*-------------------------------------------------------------------------------------------/
defp tokenize_single_string([?\\, x | tail], acc) do
  case x do
    ?\\ -> tokenize_single_string(tail, [?\\ | acc])
    ?' -> tokenize_single_string(tail, [?' | acc])
    # BUG FIX: the original built the message with `"..." ++ x`, which is a
    # badarg (++ is a list operator and x is a bare integer). Append the
    # offending character as a one-character binary instead.
    _ -> {:error, "Unknown character escape \\" <> <<x>>}
  end
end
defp tokenize_single_string([?' | tail], acc) do
  {tail, Enum.reverse(acc)}
end
defp tokenize_single_string([head | tail], acc) do
  tokenize_single_string(tail, [head | acc])
end
defp tokenize_single_string([], _acc) do
  {:error, "Unterminated string"}
end
#*-------------------------------------------------------------------------------------------\
#| Tokenize a double quoted string.                                                           |
#| Returns {remaining_input, binary} or {:error, reason}.                                     |
#| Escape Sequences:                                                                          |
#|   \a -> Bell (Alert)                                                                       |
#|   \b -> Backspace                                                                          |
#|   \t -> Tab                                                                                |
#|   \n -> Newline                                                                            |
#|   \v -> Vertical Tab                                                                       |
#|   \f -> Form Feed                                                                          |
#|   \r -> Return Carriage                                                                    |
#|   \e -> Escape                                                                             |
#|   \\ -> Backslash                                                                          |
#|   \" -> Double Quote                                                                       |
#|   \xHH -> Hex Character (1..2 Hex Digits)                                                  |
#|   \OOO -> Octal Character (1..3 Octal Digits)                                              |
#|   \cX -> Control Character (not yet implemented)                                           |
#|   \x{h..h} -> Hex Character (1..8 Hex Digits)                                              |
#|   \uhhhh -> Unicode Character (4 Hex Digits)                                               |
#|   \Uhhhhhhhh -> Unicode Character (8 Hex Digits)                                           |
#*-------------------------------------------------------------------------------------------/
defp tokenize_double_string([?\\, x | tail], acc) do
  case x do
    ?a -> tokenize_double_string(tail, [?\a | acc])
    ?b -> tokenize_double_string(tail, [?\b | acc])
    ?t -> tokenize_double_string(tail, [?\t | acc])
    ?n -> tokenize_double_string(tail, [?\n | acc])
    ?v -> tokenize_double_string(tail, [?\v | acc])
    ?f -> tokenize_double_string(tail, [?\f | acc])
    ?r -> tokenize_double_string(tail, [?\r | acc])
    ?e -> tokenize_double_string(tail, [?\e | acc])
    ?\\ -> tokenize_double_string(tail, [?\\ | acc])
    ?" -> tokenize_double_string(tail, [?" | acc])
    ?x -> continue_escape(hex_e(tail), acc)
    ?u -> continue_escape(uni_e(tail), acc)
    ?U -> continue_escape(big_uni_e(tail), acc)
    # ?c (control character) is not implemented yet.
    # BUG FIX: the octal path passed only `tail` to oct_e/1, silently
    # dropping the first octal digit `x` (e.g. \101 parsed as \01...).
    # The digit after the backslash must be part of the escape.
    _ -> continue_escape(oct_e([x | tail]), acc)
  end
end
# BUG FIX: the original destructured escape-helper results as {value, text}
# even for {:error, msg} and then returned the bare atom :error, losing the
# message. Propagate the full error tuple instead.
defp continue_escape({:error, _} = err, _acc), do: err
defp continue_escape({char, rest}, acc), do: tokenize_double_string(rest, [char | acc])
defp tokenize_double_string([?" | tail], acc) do
  {tail, list_to_binary(Enum.reverse(acc))}
end
defp tokenize_double_string([head | tail], acc) do
  tokenize_double_string(tail, [head | acc])
end
defp tokenize_double_string([], _acc) do
  {:error, "Unterminated string"}
end
#*-------------------------------------------------------------------------------------------\
#| Check for a valid hexadecimal character escape: either the braced form                     |
#| {h..h} with 1..8 hex digits, or a bare form with 1..2 hex digits.                          |
#| Returns {value, remaining} or {:error, reason}. Clauses are ordered from                   |
#| longest to shortest so the longest valid form wins.                                        |
#*-------------------------------------------------------------------------------------------/
defp hex_e([?{, a, b, c, d, e, f, g, h, ?} | rest]) when LexChar.hex8(a, b, c, d, e, f, g, h) do
  {:erlang.list_to_integer([a, b, c, d, e, f, g, h], 16), rest}
end
defp hex_e([?{, a, b, c, d, e, f, g, ?} | rest]) when LexChar.hex7(a, b, c, d, e, f, g) do
  {:erlang.list_to_integer([a, b, c, d, e, f, g], 16), rest}
end
defp hex_e([?{, a, b, c, d, e, f, ?} | rest]) when LexChar.hex6(a, b, c, d, e, f) do
  {:erlang.list_to_integer([a, b, c, d, e, f], 16), rest}
end
defp hex_e([?{, a, b, c, d, e, ?} | rest]) when LexChar.hex5(a, b, c, d, e) do
  {:erlang.list_to_integer([a, b, c, d, e], 16), rest}
end
defp hex_e([?{, a, b, c, d, ?} | rest]) when LexChar.hex4(a, b, c, d) do
  {:erlang.list_to_integer([a, b, c, d], 16), rest}
end
defp hex_e([?{, a, b, c, ?} | rest]) when LexChar.hex3(a, b, c) do
  {:erlang.list_to_integer([a, b, c], 16), rest}
end
defp hex_e([?{, a, b, ?} | rest]) when LexChar.hex2(a, b) do
  {:erlang.list_to_integer([a, b], 16), rest}
end
defp hex_e([?{, a, ?} | rest]) when LexChar.is_hex(a) do
  {:erlang.list_to_integer([a], 16), rest}
end
defp hex_e([a, b | rest]) when LexChar.hex2(a, b) do
  {:erlang.list_to_integer([a, b], 16), rest}
end
defp hex_e([a | rest]) when LexChar.is_hex(a) do
  {:erlang.list_to_integer([a], 16), rest}
end
defp hex_e(_), do: {:error, "Invalid hex escape"}
#*-------------------------------------------------------------------------------------------\
#| Check for a valid octal character escape (1..3 octal digits, longest                       |
#| match first). Returns {value, remaining} or {:error, reason}.                              |
#*-------------------------------------------------------------------------------------------/
defp oct_e([a, b, c | rest]) when LexChar.is_octal(a) and LexChar.is_octal(b) and LexChar.is_octal(c) do
  {:erlang.list_to_integer([a, b, c], 8), rest}
end
defp oct_e([a, b | rest]) when LexChar.is_octal(a) and LexChar.is_octal(b) do
  {:erlang.list_to_integer([a, b], 8), rest}
end
defp oct_e([a | rest]) when LexChar.is_octal(a) do
  {:erlang.list_to_integer([a], 8), rest}
end
defp oct_e(_), do: {:error, "Invalid octal escape"}
#*-------------------------------------------------------------------------------------------\
#| Check for a valid unicode character escape (exactly 4 hex digits).                         |
#| Returns {value, remaining} or {:error, reason}.                                            |
#*-------------------------------------------------------------------------------------------/
defp uni_e([a, b, c, d | rest]) when LexChar.hex4(a, b, c, d) do
  {:erlang.list_to_integer([a, b, c, d], 16), rest}
end
defp uni_e(_), do: {:error, "Invalid unicode escape sequence"}
#*-------------------------------------------------------------------------------------------\
#| Check for a valid long unicode character escape (exactly 8 hex digits).                    |
#| Returns {value, remaining} or {:error, reason}.                                            |
#*-------------------------------------------------------------------------------------------/
defp big_uni_e([a, b, c, d, e, f, g, h | rest]) when LexChar.hex8(a, b, c, d, e, f, g, h) do
  {:erlang.list_to_integer([a, b, c, d, e, f, g, h], 16), rest}
end
defp big_uni_e(_), do: {:error, "Invalid unicode escape sequence"}
end | |
defmodule PASM do | |
end | |
end |
Sign in to GitHub to join the conversation and comment on this gist.