A module using LPeg to strip comments and spaces in Lua code.
--[[

= ABOUT

This module uses Roberto Ierusalimschy's powerful new pattern matching library
LPeg[1] to tokenize Lua source-code into a table of tokens. I think it handles
all of Lua's syntax, but if you find anything missing I would appreciate a mail
at [email protected]. This lexer is based on the BNF[2] from the Lua manual.

= USAGE

This module returns a table of functions rather than calling module(), so save
it anywhere on your [package.path] (the name [striplua.lua] used below is only
an example) and load it with require(), as in the following interactive
session:

    Lua 5.1.1  Copyright (C) 1994-2006 Lua.org, PUC-Rio
    > lexers = require 'striplua'
    > tokens = lexers.lexer [=[
    >> 42 or 0
    >> -- some Lua source-code in a string]=]
    > = tokens
    table: 00422E40

Printed one token per line, that table reads:

    line 1, number: `42`
    line 1, whitespace: ` `
    line 1, keyword: `or`
    line 1, whitespace: ` `
    line 1, number: `0`
    line 1, whitespace: `
    `
    line 2, comment: `-- some Lua source-code in a string`
    total of 7 tokens, 2 lines

The returned table [tokens] looks like this:

    {
      -- type      , text, line
      { 'number'    , '42', 1 },
      { 'whitespace', ' ' , 1 },
      { 'keyword'   , 'or', 1 },
      { 'whitespace', ' ' , 1 },
      { 'number'    , '0' , 1 },
      { 'whitespace', '\n', 1 },
      { 'comment'   , '-- some Lua source-code in a string', 2 },
    }

= CREDITS

Written by Peter Odding, 2007/04/04

= THANKS TO

 - the Lua authors for a wonderful language;
 - Roberto for LPeg;
 - caffeine for keeping me awake :)

= LICENSE

Shamelessly ripped from the SQLite[3] project:

    The author disclaims copyright to this source code. In place of a legal
    notice, here is a blessing:

        May you do good and not evil.
        May you find forgiveness for yourself and forgive others.
        May you share freely, never taking more than you give.

[1] http://www.inf.puc-rio.br/~roberto/lpeg.html
[2] http://lua.org/manual/5.1/manual.html#8
[3] http://sqlite.org

--]]

-- unlike the lexer this is based on, this file does not call module(): it
-- simply returns a table of functions at the bottom, so load it with require()
-- written for LPeg 0.5, by the way

local lpeg = require 'lpeg'
local P, R, S, C, Cc, Ct = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cc, lpeg.Ct

-- create a pattern which captures the lua value [id] and the input matching
-- [patt] in a table
local function token(id, patt) return Ct(Cc(id) * C(patt)) end
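-- e.g. token('number', R'09'^1) matched against '42' yields the capture
-- { 'number', '42' }; the line number is patched in as a third element
-- later, by lexer() below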
local digit = R('09')
-- characters that may start an identifier; continuation characters
-- additionally allow digits and dots (see [ident] below)
local idsafe = R('AZ', 'az', '\127\255') + P '_'
-- operators
local operator = token('operator', P '==' + P '~=' + P '<=' + P '>=' + P '...'
    + P '..' + S '+-*/%^#=<>;:,.{}[]()')
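-- in the ordered choice above, multi-character operators come before their
-- prefixes, so '==' lexes as one token rather than two '=' tokens and '...'
-- is tried before '..'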
-- identifiers
local ident = token('identifier', idsafe * (idsafe + digit + P '.') ^ 0)
-- keywords
local keyword = token('keyword', (P 'and' + P 'break' + P 'do' + P 'else' +
    P 'elseif' + P 'end' + P 'false' + P 'for' + P 'function' + P 'if' +
    P 'in' + P 'local' + P 'nil' + P 'not' + P 'or' + P 'repeat' + P 'return' +
    P 'then' + P 'true' + P 'until' + P 'while') * -(idsafe + digit))
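-- the trailing -(idsafe + digit) guard keeps keywords from matching inside
-- longer names, so e.g. 'order' lexes as the identifier 'order' rather than
-- the keyword 'or' followed by 'der'; also note the P '.' in [ident]: dotted
-- names like 'io.write' come out as a single identifier token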
-- numbers
local number_sign = S'+-'^-1
local number_decimal = digit ^ 1
local number_hexadecimal = P '0' * S 'xX' * R('09', 'AF', 'af') ^ 1
local number_float = (digit^1 * P'.' * digit^0 + P'.' * digit^1) *
                     (S'eE' * number_sign * digit^1)^-1
local number = token('number', number_hexadecimal +
                               number_float +
                               number_decimal)
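-- e.g. '0x1F' matches number_hexadecimal and '3.14e-2' matches number_float;
-- number_sign is only used inside exponents, so '-42' lexes as the operator
-- '-' followed by the number '42'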
-- callback for [=[ long strings ]=]
-- ps. LPeg is for Lua what regex is for Perl, which makes me smile :)
local longstring = #(P '[[' + (P '[' * P '=' ^ 0 * P '[')) *
  P(function(input, index)
    local level = input:match('^%[(=*)%[', index)
    if level then
      local _, stop = input:find(']' .. level .. ']', index, true)
      if stop then return stop + 1 end
    end
  end)
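-- e.g. longstring matches '[=[ a ]] inside ]=]' in full: the match-time
-- function scans for a closing bracket with the same number of '=' signs
-- as the opener, so the plain ']]' in the middle does not terminate it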
-- strings
local singlequoted_string = P "'" * ((1 - S "'\r\n\f\\") + (P '\\' * 1)) ^ 0 * "'"
local doublequoted_string = P '"' * ((1 - S '"\r\n\f\\') + (P '\\' * 1)) ^ 0 * '"'
local string = token('string', singlequoted_string +
                               doublequoted_string +
                               longstring)
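-- e.g. doublequoted_string matches the whole of '"she said \"hi\""': the
-- (P '\\' * 1) branch consumes a backslash together with whatever follows,
-- so an escaped quote cannot end the string early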
-- comments
local singleline_comment = P '--' * (1 - S '\r\n\f') ^ 0
local multiline_comment = P '--' * longstring
local comment = token('comment', multiline_comment + singleline_comment)
-- whitespace
local whitespace = token('whitespace', S('\r\n\f\t ')^1)
-- ordered choice of all tokens and last-resort error which consumes one character
local any_token = whitespace + number + keyword + ident +
                  string + comment + operator + token('error', 1)
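-- ordering matters in the choices above: keywords are tried before
-- identifiers, multiline comments before single-line ones, and comments
-- before operators, so '--' is never lexed as two '-' operators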
-- private interface
local table_of_tokens = Ct(any_token ^ 0)

-- increment [line] by the number of line-ends in [text]
local function sync(line, text)
  local index, limit = 1, #text
  while index <= limit do
    local start, stop = text:find('\r\n', index, true)
    if not start then
      start, stop = text:find('[\r\n\f]', index)
      if not start then break end
    end
    index = stop + 1
    line = line + 1
  end
  return line
end
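-- e.g. sync(1, 'a\r\nb\nc') returns 3: the '\r\n' pair counts as a single
-- line-end (hence the plain find for it first), then the lone '\n' adds one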
-- we only need to synchronize the line-counter for these token types
local multiline_tokens = { comment = true, string = true, whitespace = true }

-- public interface
local function lexer(input)
  assert(type(input) == 'string', 'bad argument #1 (expected string)')
  local line = 1
  local tokens = lpeg.match(table_of_tokens, input)
  for _, tok in ipairs(tokens) do
    tok[3] = line
    if multiline_tokens[tok[1]] then line = sync(line, tok[2]) end
  end
  return tokens
end
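-- e.g. lexer('x = 1') returns:
--   { { 'identifier', 'x', 1 }, { 'whitespace', ' ', 1 },
--     { 'operator', '=', 1 }, { 'whitespace', ' ', 1 },
--     { 'number', '1', 1 } }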
-- strip interface: rebuild the source without comments, collapsing runs of
-- whitespace to a single space while preserving line breaks
local function strip(s)
  local tokens = lexer(s)
  local line_head = true   -- are we at the start of an output line?
  local prev_space = false -- did we just skip whitespace or a comment?
  local line = {}
  for _, v in ipairs(tokens) do
    if v[1] == 'comment' or v[1] == 'whitespace' then
      if v[2]:match "\n" then
        -- keep one '\n' per line-break so line numbers stay stable
        line[#line+1] = v[2]:gsub(".-\n[^\n]*", "\n")
        line_head = true
      end
      prev_space = true
    else
      if prev_space and not line_head then
        line[#line+1] = " "
      end
      line[#line+1] = v[2]
      prev_space = false
      line_head = false
    end
  end
  return table.concat(line)
end
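-- e.g. strip('local x = 1 -- set x\nprint(x)') returns
-- 'local x = 1\nprint(x)': the comment disappears, the blanks around it
-- collapse and the line break survives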
-- render [s] as a sequence of C string literals, wrapping at [wide] columns
local function tocstring(s, wide)
  local texts = {}
  local line = {}
  local curwide = 0
  wide = wide - 6 -- indent and quotes
  for c in s:gmatch "." do
    if c == '\n' then
      line[#line+1] = '\\n'
      curwide = curwide + 2
    elseif c:match '["\\]' then
      line[#line+1] = '\\'..c
      curwide = curwide + 2
    elseif c:match '[%z\1-\31\128-\255]' then
      -- three-digit octal escapes: unlike \xHH they cannot absorb a
      -- following hex digit in the generated C source; %z stands for the
      -- zero byte, which Lua 5.1 patterns cannot contain literally
      -- (c:byte() because the local [string] pattern above shadows the
      -- string library)
      line[#line+1] = ('\\%03o'):format(c:byte())
      curwide = curwide + 4
    else
      line[#line+1] = c
      curwide = curwide + 1
    end
    if curwide >= wide then
      texts[#texts+1] = ' "' .. table.concat(line) .. '"\n'
      line = {}
      curwide = 0
    end
  end
  if #line ~= 0 then
    texts[#texts+1] = ' "' .. table.concat(line) .. '"\n'
  end
  return table.concat(texts)
end
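-- e.g. tocstring('print "hi"\n', 40) returns the single line
--   ' "print \\"hi\\"\\n"\n'
-- i.e. one quoted C string literal per line, ready to paste into C source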
return {
  lexer = lexer,
  strip = strip,
  tocstring = tocstring,
}
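-- example usage (the file name is arbitrary; assume it is saved as
-- striplua.lua somewhere on package.path):
--
--   local striplua = require 'striplua'
--   print(striplua.strip [[
--   local answer = 42 -- the answer
--   print(answer)
--   ]])
--   --> local answer = 42
--   --> print(answer)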