-
-
Save flamendless/b46664758bc0b96800ad5d0e137a1302 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Reserved words the tokenizer emits as "keyword" tokens.
-- The tokenizer tries these in order and takes the first one that matches
-- at the current position, so the list must stay sorted longest first.
local keywords = { "function", "then", "end", "if" }
-- Punctuation and operators the tokenizer emits as "symbol" tokens.
-- They are tried in order and the first match wins, so any multi-character
-- symbol must be listed before a shorter symbol that is a prefix of it.
local symbols = { "~=", "==", "(", ")", "," }
--- Tell whether a one-character string is an ASCII decimal digit.
-- @param char string holding a single character (may be empty)
-- @return true for "0".."9", false for anything else, including ""
local function is_number(char)
  -- byte() returns no value for the empty string; treat that as "not a digit"
  -- instead of raising on a nil comparison.
  local code_point = char:byte()
  if not code_point then
    return false
  end
  -- 48 is "0" and 57 is "9".
  return code_point >= 48 and code_point <= 57
end
--- Tell whether a one-character string may appear in an identifier.
-- Accepts ASCII letters and the underscore only.
-- @param char string holding a single character (may be empty)
-- @return true for "A".."Z", "a".."z" and "_", false otherwise (including "")
local function is_identifier(char)
  -- byte() returns no value for the empty string; treat that as a non-match
  -- instead of raising on a nil comparison.
  local code_point = char:byte()
  if not code_point then
    return false
  end
  local is_upper = code_point >= 65 and code_point <= 90   -- "A".."Z"
  local is_lower = code_point >= 97 and code_point <= 122  -- "a".."z"
  return is_upper or is_lower or char == "_"
end
-- Characters treated as whitespace: space, tab, carriage return, line feed.
local whitespace_chars = {
  [" "] = true,
  ["\t"] = true,
  ["\r"] = true,
  ["\n"] = true,
}

--- Tell whether a one-character string is whitespace.
-- @param char string holding a single character
-- @return true for space, tab, CR or LF; false otherwise
local function is_space(char)
  return whitespace_chars[char] == true
end
--- Split a piece of Lua-like source text into a flat list of tokens.
--
-- Every token is a table `{type = ..., value = ...}` where `type` is one of
-- "keyword", "symbol", "letter", "space", "line_comment", "string" or
-- "number", and `value` is the exact source text of the token. Concatenating
-- all values in order reproduces the input.
--
-- Kept as a global function, as in the original file.
--
-- @param code string to tokenize
-- @return the token list on success
-- @return nil plus an error message when an unexpected character or an
--         unterminated string is found
function tokenize(code)
  local length = #code
  local pos = 1
  local tokens = {}

  -- Single character at an absolute index ("" when out of range).
  local function char_at(index)
    return code:sub(index, index)
  end

  -- Append the token spanning pos..stop and move past it.
  local function emit(kind, stop)
    tokens[#tokens + 1] = { type = kind, value = code:sub(pos, stop) }
    pos = stop + 1
  end

  -- Index of the last consecutive character, from `start` on, that satisfies
  -- `accepts`. Stops at end of input, so `accepts` never sees "" and a token
  -- that runs to the end of the text is still produced.
  local function scan_while(start, accepts)
    local index = start
    while index <= length and accepts(char_at(index)) do
      index = index + 1
    end
    return index - 1
  end

  -- Try each fixed word of `list` at the current position; emit the first
  -- match. With `needs_boundary`, a word directly followed by a letter, digit
  -- or underscore is rejected, so "endpoint" is not read as keyword "end".
  local function match_fixed(list, kind, needs_boundary)
    for _, word in ipairs(list) do
      local after = pos + #word
      if code:sub(pos, after - 1) == word
        and not (needs_boundary and code:find("^[%w_]", after)) then
        emit(kind, after - 1)
        return true
      end
    end
    return false
  end

  -- Index of the quote closing the string that opens at `start`, or nil when
  -- the input ends first. A backslash escapes the character after it.
  local function find_string_end(start)
    local index = start + 1
    while index <= length do
      local current = char_at(index)
      if current == "\\" then
        index = index + 2
      elseif current == "\"" then
        return index
      else
        index = index + 1
      end
    end
    return nil
  end

  local function is_comment_char(current)
    return current ~= "\n"
  end

  local function is_number_char(current)
    return is_number(current) or current == "."
  end

  while pos <= length do
    if not (match_fixed(keywords, "keyword", true)
      or match_fixed(symbols, "symbol", false)) then
      local char = char_at(pos)
      if is_identifier(char) then
        emit("letter", scan_while(pos + 1, is_identifier))
      elseif is_space(char) then
        emit("space", scan_while(pos + 1, is_space))
      elseif char == "-" and char_at(pos + 1) == "-" then
        -- The comment runs up to, but does not include, the newline.
        emit("line_comment", scan_while(pos + 2, is_comment_char))
      elseif char == "\"" then
        local closing = find_string_end(pos)
        if not closing then
          return nil, "unterminated string starting at position " .. pos
        end
        -- The value keeps both surrounding quotes.
        emit("string", closing)
      elseif is_number(char) then
        emit("number", scan_while(pos + 1, is_number_char))
      else
        return nil, "unexpected character '" .. char .. "' at position " .. pos
      end
    end
  end

  return tokens
end
-- Self-check: tokenize a small sample, print every token, and verify that
-- joining the token values reproduces the sample exactly.
local code = [[
function main(a,b,c)
-- prints hello world to stdout
print("hello world!")
if true ~= false and 1234 then
print("aaa")
end
end
]]

-- tokenize returns nil plus a message on failure; assert surfaces that message.
local tokens = assert(tokenize(code))

-- Collect the values and join once, rather than growing a string with `..`
-- on every iteration.
local pieces = {}
for index, token in ipairs(tokens) do
  print(index, token.type, token.value)
  pieces[index] = token.value
end
local new_code = table.concat(pieces)

assert(code == new_code, "token values do not reproduce the input")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment