A module using LPeg to strip comments and spaces in Lua code.
--[[

= ABOUT

This module uses Roberto Ierusalimschy's powerful new pattern matching library
LPeg[1] to tokenize Lua source-code into a table of tokens. I think it handles
all of Lua's syntax, but if you find anything missing I would appreciate a mail
at [email protected]. This lexer is based on the BNF[2] from the Lua manual.

= USAGE

This module returns a table of functions rather than calling module(), so save
it anywhere on your [package.path] (the name [striplua.lua] used below is only
an example) and load it with require(), as in the following interactive
session:

    Lua 5.1.1  Copyright (C) 1994-2006 Lua.org, PUC-Rio
    > lexers = require 'striplua'
    > tokens = lexers.lexer [=[
    >> 42 or 0
    >> -- some Lua source-code in a string]=]
    > = tokens
    table: 00422E40

Printed one token per line, that table reads:

    line 1, number: `42`
    line 1, whitespace: ` `
    line 1, keyword: `or`
    line 1, whitespace: ` `
    line 1, number: `0`
    line 1, whitespace: `
    `
    line 2, comment: `-- some Lua source-code in a string`
    total of 7 tokens, 2 lines

The returned table [tokens] looks like this:

    {
      -- type      , text, line
      { 'number'    , '42', 1 },
      { 'whitespace', ' ' , 1 },
      { 'keyword'   , 'or', 1 },
      { 'whitespace', ' ' , 1 },
      { 'number'    , '0' , 1 },
      { 'whitespace', '\n', 1 },
      { 'comment'   , '-- some Lua source-code in a string', 2 },
    }

= CREDITS

Written by Peter Odding, 2007/04/04

= THANKS TO

 - the Lua authors for a wonderful language;
 - Roberto for LPeg;
 - caffeine for keeping me awake :)

= LICENSE

Shamelessly ripped from the SQLite[3] project:

    The author disclaims copyright to this source code. In place of a legal
    notice, here is a blessing:

        May you do good and not evil.
        May you find forgiveness for yourself and forgive others.
        May you share freely, never taking more than you give.

[1] http://www.inf.puc-rio.br/~roberto/lpeg.html
[2] http://lua.org/manual/5.1/manual.html#8
[3] http://sqlite.org

--]]

-- unlike the lexer this is based on, this file does not call module(): it
-- simply returns a table of functions at the bottom, so load it with require()
-- written for LPeg 0.5, by the way

local lpeg = require 'lpeg'
local P, R, S, C, Cc, Ct = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cc, lpeg.Ct

-- create a pattern which captures the lua value [id] and the input matching
-- [patt] in a table
local function token(id, patt) return Ct(Cc(id) * C(patt)) end
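-- e.g. token('number', R'09'^1) matched against '42' yields the capture
-- { 'number', '42' }; the line number is patched in as a third element
-- later, by lexer() below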
local digit = R('09')
-- characters that may start an identifier; continuation characters
-- additionally allow digits and dots (see [ident] below)
local idsafe = R('AZ', 'az', '\127\255') + P '_'
-- operators
local operator = token('operator', P '==' + P '~=' + P '<=' + P '>=' + P '...'
    + P '..' + S '+-*/%^#=<>;:,.{}[]()')
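-- in the ordered choice above, multi-character operators come before their
-- prefixes, so '==' lexes as one token rather than two '=' tokens and '...'
-- is tried before '..'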
-- identifiers
local ident = token('identifier', idsafe * (idsafe + digit + P '.') ^ 0)
-- keywords
local keyword = token('keyword', (P 'and' + P 'break' + P 'do' + P 'else' +
    P 'elseif' + P 'end' + P 'false' + P 'for' + P 'function' + P 'if' +
    P 'in' + P 'local' + P 'nil' + P 'not' + P 'or' + P 'repeat' + P 'return' +
    P 'then' + P 'true' + P 'until' + P 'while') * -(idsafe + digit))
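-- the trailing -(idsafe + digit) guard keeps keywords from matching inside
-- longer names, so e.g. 'order' lexes as the identifier 'order' rather than
-- the keyword 'or' followed by 'der'; also note the P '.' in [ident]: dotted
-- names like 'io.write' come out as a single identifier token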
-- numbers
local number_sign = S'+-'^-1
local number_decimal = digit ^ 1
local number_hexadecimal = P '0' * S 'xX' * R('09', 'AF', 'af') ^ 1
local number_float = (digit^1 * P'.' * digit^0 + P'.' * digit^1) *
                     (S'eE' * number_sign * digit^1)^-1
local number = token('number', number_hexadecimal +
                               number_float +
                               number_decimal)
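-- e.g. '0x1F' matches number_hexadecimal and '3.14e-2' matches number_float;
-- number_sign is only used inside exponents, so '-42' lexes as the operator
-- '-' followed by the number '42'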
-- callback for [=[ long strings ]=]
-- ps. LPeg is for Lua what regex is for Perl, which makes me smile :)
local longstring = #(P '[[' + (P '[' * P '=' ^ 0 * P '[')) *
  P(function(input, index)
    local level = input:match('^%[(=*)%[', index)
    if level then
      local _, stop = input:find(']' .. level .. ']', index, true)
      if stop then return stop + 1 end
    end
  end)
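-- e.g. longstring matches '[=[ a ]] inside ]=]' in full: the match-time
-- function scans for a closing bracket with the same number of '=' signs
-- as the opener, so the plain ']]' in the middle does not terminate it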
-- strings
local singlequoted_string = P "'" * ((1 - S "'\r\n\f\\") + (P '\\' * 1)) ^ 0 * "'"
local doublequoted_string = P '"' * ((1 - S '"\r\n\f\\') + (P '\\' * 1)) ^ 0 * '"'
local string = token('string', singlequoted_string +
                               doublequoted_string +
                               longstring)
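-- e.g. doublequoted_string matches the whole of '"she said \"hi\""': the
-- (P '\\' * 1) branch consumes a backslash together with whatever follows,
-- so an escaped quote cannot end the string early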
-- comments
local singleline_comment = P '--' * (1 - S '\r\n\f') ^ 0
local multiline_comment = P '--' * longstring
local comment = token('comment', multiline_comment + singleline_comment)
-- whitespace
local whitespace = token('whitespace', S('\r\n\f\t ')^1)
-- ordered choice of all tokens and last-resort error which consumes one character
local any_token = whitespace + number + keyword + ident +
                  string + comment + operator + token('error', 1)
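-- ordering matters in the choices above: keywords are tried before
-- identifiers, multiline comments before single-line ones, and comments
-- before operators, so '--' is never lexed as two '-' operators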
-- private interface
local table_of_tokens = Ct(any_token ^ 0)

-- increment [line] by the number of line-ends in [text]
local function sync(line, text)
  local index, limit = 1, #text
  while index <= limit do
    local start, stop = text:find('\r\n', index, true)
    if not start then
      start, stop = text:find('[\r\n\f]', index)
      if not start then break end
    end
    index = stop + 1
    line = line + 1
  end
  return line
end
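-- e.g. sync(1, 'a\r\nb\nc') returns 3: the '\r\n' pair counts as a single
-- line-end (hence the plain find for it first), then the lone '\n' adds one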
-- we only need to synchronize the line-counter for these token types
local multiline_tokens = { comment = true, string = true, whitespace = true }

-- public interface
local function lexer(input)
  assert(type(input) == 'string', 'bad argument #1 (expected string)')
  local line = 1
  local tokens = lpeg.match(table_of_tokens, input)
  for _, tok in ipairs(tokens) do
    tok[3] = line
    if multiline_tokens[tok[1]] then line = sync(line, tok[2]) end
  end
  return tokens
end
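-- e.g. lexer('x = 1') returns:
--   { { 'identifier', 'x', 1 }, { 'whitespace', ' ', 1 },
--     { 'operator', '=', 1 }, { 'whitespace', ' ', 1 },
--     { 'number', '1', 1 } }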
-- strip interface: rebuild the source without comments, collapsing runs of
-- whitespace to a single space while preserving line breaks
local function strip(s)
  local tokens = lexer(s)
  local line_head = true   -- are we at the start of an output line?
  local prev_space = false -- did we just skip whitespace or a comment?
  local line = {}
  for _, v in ipairs(tokens) do
    if v[1] == 'comment' or v[1] == 'whitespace' then
      if v[2]:match "\n" then
        -- keep one '\n' per line-break so line numbers stay stable
        line[#line+1] = v[2]:gsub(".-\n[^\n]*", "\n")
        line_head = true
      end
      prev_space = true
    else
      if prev_space and not line_head then
        line[#line+1] = " "
      end
      line[#line+1] = v[2]
      prev_space = false
      line_head = false
    end
  end
  return table.concat(line)
end
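-- e.g. strip('local x = 1 -- set x\nprint(x)') returns
-- 'local x = 1\nprint(x)': the comment disappears, the blanks around it
-- collapse and the line break survives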
-- render [s] as a sequence of C string literals, wrapping at [wide] columns
local function tocstring(s, wide)
  local texts = {}
  local line = {}
  local curwide = 0
  wide = wide - 6 -- indent and quotes
  for c in s:gmatch "." do
    if c == '\n' then
      line[#line+1] = '\\n'
      curwide = curwide + 2
    elseif c:match '["\\]' then
      line[#line+1] = '\\'..c
      curwide = curwide + 2
    elseif c:match '[%z\1-\31\128-\255]' then
      -- three-digit octal escapes: unlike \xHH they cannot absorb a
      -- following hex digit in the generated C source; %z stands for the
      -- zero byte, which Lua 5.1 patterns cannot contain literally
      -- (c:byte() because the local [string] pattern above shadows the
      -- string library)
      line[#line+1] = ('\\%03o'):format(c:byte())
      curwide = curwide + 4
    else
      line[#line+1] = c
      curwide = curwide + 1
    end
    if curwide >= wide then
      texts[#texts+1] = ' "' .. table.concat(line) .. '"\n'
      line = {}
      curwide = 0
    end
  end
  if #line ~= 0 then
    texts[#texts+1] = ' "' .. table.concat(line) .. '"\n'
  end
  return table.concat(texts)
end
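-- e.g. tocstring('print "hi"\n', 40) returns the single line
--   ' "print \\"hi\\"\\n"\n'
-- i.e. one quoted C string literal per line, ready to paste into C source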
return {
  lexer = lexer,
  strip = strip,
  tocstring = tocstring,
}
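-- example usage (the file name is arbitrary; assume it is saved as
-- striplua.lua somewhere on package.path):
--
--   local striplua = require 'striplua'
--   print(striplua.strip [[
--   local answer = 42 -- the answer
--   print(answer)
--   ]])
--   --> local answer = 42
--   --> print(answer)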