Last active
January 17, 2018 15:05
-
-
Save wqweto/43796a5d06ad9d7953bcc1825e39a806 to your computer and use it in GitHub Desktop.
Battle of the slowest - markdown parser 2.0
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--
-- lm_main.lua - a simple markdown to html converter with some twists
--
-- original battle at http://forums.bgdev.org/index.php?showtopic=49993
--
local lpeg = require("lpeg") | |
local re = require("lpeg.re") | |
-- Substitution capture that rewrites the three HTML-significant characters
-- to their entities and copies every other character through unchanged.
local esc_pattern = lpeg.Cs((lpeg.P"<"/"&lt;" + lpeg.P">"/"&gt;" + lpeg.P"&"/"&amp;" + 1)^0)

-- Escape `text` for use as HTML character data.
--   text: string to escape
--   returns: `text` with '<', '>' and '&' replaced by &lt;, &gt; and &amp;;
--            the very same string when it contains none of them.
local function html_escape(text)
  -- cheap pre-check so strings with nothing to escape skip the lpeg match
  if string.find(text, "[<>&]") then
    return esc_pattern:match(text)
  end
  return text
end
-- Heading records in document order; appended to by the `heading` action
-- below and read back when the table of contents is written.
local toc = {}
-- Link definitions collected by the `link_def` action: id -> href.
local links = {}

-- Block-level grammar in lpeg.re syntax. A document is a list of blocks
-- (heading, link definition or paragraph) separated by empty lines.
--   heading:   1 to 6 '#' characters, one blank, then the rest of the line
--   link_def:  '[id]' with an optional ':', one blank, then the rest of the
--              line as the target
--   paragraph: consecutive lines that are not empty and do not start a
--              heading or a link definition
local block_pattern = [=[
document <- {| block+ |} <empty_line>* !.
block <- <empty_line>* (<heading> / <link_def> / <paragraph>)
heading <- ({ '#' '#'^-5 } !'#' %ws { (!%nl .)* } %nl?) -> heading
link_def <- ('[' { (!']' .)* } ']' ':'? %ws { (!%nl .)* } %nl?) -> link_def
paragraph <- { (%nl? !<empty_line> !<link_def> !<heading> (!%nl .)+)+ }
empty_line <- ((!%nl %s)* %nl) / (%ws+ !.)
]=]

-- Definitions referenced from block_pattern (%ws and the `-> name` actions).
local block_defs = {
  -- a single blank: space or tab
  ws = lpeg.S" \t",
  -- level: the captured run of '#'; text: the heading title.
  -- Registers the heading in `toc` and returns the record, which becomes
  -- the capture value of the block.
  heading = function(level, text)
    local entry = { tag = "head", level = #level, id = #toc + 1, text }
    toc[#toc + 1] = entry
    return entry
  end,
  -- Records the link target; returns nothing, so a link definition
  -- contributes no capture value.
  link_def = function(id, href) links[id] = href end,
}
-- Inline grammar in lpeg.re syntax: rewrites *bold*, _italic_ and [link]
-- spans inside a paragraph and passes other non-space runs through
-- html_escape. A [link] only matches when its text is a known link id
-- (checked at match time by is_link).
local inline_pattern = [=[
toplevel <- {~ (%s* <text> / <text>? %s+ / .)* ~}
text <- <bold> / <italic> / <link> / ((!([*_] ws) %S)+ -> html_escape)
bold <- ('*' !%s {~ (%s* <text>)+ ~} '*' &ws) -> bold
italic <- ('_' !%s {~ (%s* <text>)+ ~} '_' &ws) -> italic
link <- ('[' { (!']' .)+ => is_link } ']') -> link
ws <- %s / !.
]=]

-- Actions referenced from inline_pattern.
local inline_defs = {
  bold = function(text) return "<b>" .. text .. "</b>" end,
  italic = function(text) return "<i>" .. text .. "</i>" end,
  html_escape = html_escape,
  -- match-time check: succeed only for ids present in `links`
  is_link = function(_src, _pos, text) return links[text] ~= nil end,
  -- is_link guarantees links[text] exists when this runs
  link = function(text)
    -- the href sits inside a single-quoted attribute, so the quote itself
    -- must be escaped in addition to the usual HTML characters
    local href = html_escape(links[text]):gsub("'", "&#39;")
    return string.format("<a href='%s'>%s</a>", href, html_escape(text))
  end,
}
-- Build a pattern matching `openp`, one or more characters that do not
-- start the closing delimiter, then the closing delimiter.
--   openp: opening delimiter (anything lpeg.P accepts)
--   repl:  optional replacement applied with the `/` operator; the text
--          between the delimiters is the pattern's only capture
--   endp:  optional closing delimiter; defaults to the opening one
--   returns: the resulting lpeg pattern
local function subst(openp, repl, endp)
  local opener = lpeg.P(openp)
  local closer = opener
  if endp then
    closer = lpeg.P(endp)
  end
  local delimited = opener * lpeg.C((1 - closer)^1) * closer
  if not repl then
    return delimited
  end
  return delimited / repl
end
-- Matches "[text]". When `text` is a known link id it becomes an anchor,
-- otherwise the brackets and text are emitted unchanged. `text` itself is
-- inserted as-is: the caller runs html_escape over the paragraph before
-- matching.
local link_pattern = subst("[", nil, "]") / function(text)
  local href = links[text]
  if href == nil then
    return "[" .. text .. "]"
  end
  -- the href comes straight from the markdown source and lands inside a
  -- single-quoted attribute: escape HTML characters and the quote itself
  href = html_escape(href):gsub("'", "&#39;")
  return string.format("<a href='%s'>%s</a>", href, text)
end
-- Single-pass inline renderer: a substitution capture that tries, at each
-- position, a *bold* span, an _italic_ span, a [link], and otherwise copies
-- one character. Requires at least one character, so it does not match the
-- empty string.
local fast_inline_pattern = lpeg.Cs((
    subst("*", "<b>%1</b>")
  + subst("_", "<i>%1</i>")
  + link_pattern
  + lpeg.P(1)
)^1)
-- Convert a markdown document to HTML: a table of contents built from the
-- headings, followed by the headings and paragraphs themselves.
--   arg: script argument table; arg[0] is the script name, arg[1] the
--        optional input path (default stdin), arg[2] the optional output
--        path (default stdout)
-- Phase timings are written to stderr. Exits with status 1 when the input
-- cannot be opened and 2 when the output cannot be created.
local function main(arg)
  local start_clock = os.clock()
  local usage = string.format([[
usage: %s [input.md] [output.html]
where input and output defaults to console
]], arg[0] or "lm_main.lua")
  local block_grammar = re.compile(block_pattern, block_defs)
  -- only used by the commented-out grammar-based renderer further down
  local inline_grammar = re.compile(inline_pattern, inline_defs)
  -- matches when the text contains at least one character that needs
  -- inline processing or escaping
  local inline_find = lpeg.S("*_<>&[")
  inline_find = (1 - inline_find)^0 * inline_find
  -- read input
  local inp = io.stdin
  if #arg >= 1 then
    -- plain "r": Lua 5.2+ rejects non-standard mode strings such as "rt"
    inp = io.open(arg[1], "r")
    if inp == nil then
      io.stderr:write(usage .. string.format("ERROR: %s not found\n", arg[1]))
      os.exit(1)
    end
  end
  local markdown = inp:read("*a")
  if inp ~= io.stdin then
    inp:close()
  end
  io.stderr:write(string.format("%.3f: after read input\n", os.clock() - start_clock))
  -- parse markdown in blocks (headings & paragraphs) while collecting link defs;
  -- the grammar needs at least one block, so empty or blank-only input
  -- yields nil and is treated as a document with no blocks
  local blocks = block_grammar:match(markdown) or {}
  io.stderr:write(string.format("%.3f: after parse blocks\n", os.clock() - start_clock))
  -- dump html prolog
  local out = io.stdout
  if #arg >= 2 then
    out = io.open(arg[2], "w")
    if out == nil then
      io.stderr:write(usage .. string.format("ERROR: cannot create %s\n", arg[2]))
      os.exit(2)
    end
  end
  out:write("<html><head></head><body>\n")
  -- dump table-of-contents as nested ol's
  local level = 0
  for _, v in ipairs(toc) do
    -- open or close one list per step until the nesting depth equals
    -- the heading level
    while level ~= v.level do
      local opening = level < v.level
      local tag = opening and "<ol>" or "</ol>"
      out:write(string.rep(" ", level - (opening and 0 or 1)) .. tag .. "\n")
      level = level + (opening and 1 or -1)
    end
    out:write(string.format("%s<li><a id='_%d' href='#%d'>%s</a></li>\n",
      string.rep(" ", v.level), v.id, v.id, html_escape(v[1])))
  end
  -- close the lists that are still open
  while level > 0 do
    out:write(string.rep(" ", level - 1) .. "</ol>\n")
    level = level - 1
  end
  io.stderr:write(string.format("%.3f: after dump TOC\n", os.clock() - start_clock))
  -- dump blocks and replace inline elements (bold, italic, links)
  for _, v in ipairs(blocks) do
    if type(v) == "table" then
      -- heading record produced by the block grammar
      if v.tag == "head" then
        out:write(string.format("<h%d id='%d'>%s <a href='#_%d'>[^]</a></h%d>\n",
          v.level, v.id, html_escape(v[1]), v.id, v.level))
      end
    else
      -- paragraph text
      if inline_find:match(v) then
        --out:write("<p>"..inline_grammar:match(v).."</p>\n")
        out:write("<p>"..fast_inline_pattern:match(html_escape(v)).."</p>\n")
      else
        out:write("<p>"..html_escape(v).."</p>\n")
      end
    end
  end
  io.stderr:write(string.format("%.3f: after dump blocks\n", os.clock() - start_clock))
  -- dump html epilog
  out:write("</body></html>")
  -- close (and thereby flush) a file we opened ourselves; stdout is left alone
  if out ~= io.stdout then
    out:close()
  end
  io.stderr:write(string.format("%.3f: done\n", os.clock() - start_clock))
end

main(arg)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment