Created
May 27, 2011 07:24
-
-
Save fritzsche/994796 to your computer and use it in GitHub Desktop.
Ruby Textile Parser (work in progress)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "pp" | |
require "test/unit" | |
require "cgi" | |
# (C) Thomas Fritzsche May, 2011 | |
# Small parser-combinator toolkit.
#
# Every combinator returns a lambda that takes the remaining input string and
# returns either a Parser::Match (success) or nil (failure). Most combinators
# accept an optional guard block: when the block returns exactly +false+ the
# parser fails without looking at the input.
module Parser
  # Node of the parse tree built by the combinators.
  #
  # token    - text consumed by this match ("" for zero-width matches)
  # children - sub-matches, or nil/empty for a leaf
  # name     - optional label attached with Parser#as
  class Match
    attr_accessor :token, :name, :children

    def initialize(token = "", children = nil, name = nil)
      @token = token
      @children = children
      @name = name
    end

    # Collapses the match tree into plain Ruby data.
    #
    # Leaves become their token. Inner nodes become the compacted value of
    # their children: if any child produced a Hash or Array, unlabeled text
    # children are dropped and only the structured children are kept (a
    # single one is unwrapped); otherwise the child strings are joined.
    # A named node wraps its value as { name => value }.
    def compact
      return label(token) if children.nil? || children.length == 0

      parts = children.map(&:compact)
      parts.compact!
      return label(parts[0]) if parts.length == 1

      structured = parts.select { |part| part.instance_of?(Hash) || part.instance_of?(Array) }
      return label(parts.join) if structured.empty?
      return label(structured[0]) if structured.length == 1

      label(structured)
    end

    private

    # Wraps +value+ in { name => value } when this match carries a name.
    def label(value)
      name.nil? ? value : { name => value }
    end
  end

  # Builds a primitive parser.
  #
  # reg - a Regexp that must match at position 0 of the input, or a Symbol
  #       naming a method on self that is called with the input (this allows
  #       recursive grammars, because the method is looked up at parse time).
  def match(reg, &block)
    if reg.class == Symbol
      lambda do |str|
        return nil if block && block.call == false
        send(reg, str)
      end
    else
      lambda do |str|
        return nil if block && block.call == false
        (str =~ reg) == 0 ? Match.new($~.to_s) : nil
      end
    end
  end

  # Parser that succeeds when all parsers in +lb+ succeed one after another.
  # Each parser is applied to the input left over by the previous ones.
  def seq(*lb, &block)
    lambda do |str|
      return nil if block && block.call == false
      rest = str
      children = []
      token = ""
      lb.each do |parser|
        found = parser.call(rest)
        # `return` inside this block leaves the enclosing lambda.
        return nil if found.nil?
        children << found
        token += found.token
        rest = str[token.length..-1]
      end
      Match.new(token, children)
    end
  end

  # Parser that returns the result of the first parser in +lb+ that succeeds,
  # or nil when none does.
  def choice(*lb, &block)
    lambda do |str|
      return nil if block && block.call == false
      lb.each do |parser|
        found = parser.call(str)
        return found unless found.nil?
      end
      nil
    end
  end

  # Parser that applies +rep+ repeatedly.
  #
  # min - minimum number of repetitions (0 means "optional")
  # max - maximum number of repetitions, nil for unbounded
  #
  # On empty (or nil) input the parser succeeds with an empty match when
  # min is 0 and fails otherwise, without calling +rep+.
  def repeat(rep, min = 0, max = nil, &block)
    lambda do |str|
      return nil if block && block.call == false
      if str.nil? || str.length == 0
        return min == 0 ? Match.new("") : nil
      end

      rest = str
      count = 0
      children = []
      token = ""
      loop do
        found = rep.call(rest)
        break if found.nil?
        token += found.token
        children << found
        count += 1
        break if !max.nil? && max <= count
        if max.nil? && found.token.empty?
          # A zero-width match leaves the input untouched, so an unbounded
          # repeat would call +rep+ on the same input forever. Stop here; the
          # empty match could be repeated any number of times, so +min+ is
          # considered satisfied.
          count = min if count < min
          break
        end
        rest = str[token.length..-1]
      end

      return nil if (min != 0 && count < min) || (!max.nil? && count > max)
      Match.new(token, children)
    end
  end

  # Positive lookahead: succeeds when +exp+ matches but consumes nothing.
  #
  # The result is a fresh zero-width leaf (keeping only the name of the
  # lookahead match), so text seen by the lookahead does not show up in
  # Match#compact output.
  def present?(exp, &block)
    lambda do |str|
      return nil if block && block.call == false
      found = exp.call(str)
      return nil if found.nil?
      Match.new("", nil, found.name)
    end
  end

  # Negative lookahead: succeeds with an empty match when +exp+ fails.
  def absent?(exp, &block)
    lambda do |str|
      return nil if block && block.call == false
      exp.call(str).nil? ? Match.new("") : nil
    end
  end

  # Labels the match produced by +exp+ with +name+. The name is set on the
  # match object returned by +exp+ (replacing any name it already had).
  def as(name, exp, &block)
    lambda do |str|
      return nil if block && block.call == false
      found = exp.call(str)
      return nil if found.nil?
      found.name = name
      found
    end
  end

  # Runs +pr+ and reports the outcome (a Match or nil) to the block, then
  # returns that outcome unchanged. Unlike the other combinators the block is
  # an observer here, not a guard.
  def matched?(pr, &block)
    lambda do |str|
      outcome = pr.call(str)
      block.call(outcome) if block
      outcome
    end
  end

  # Parser that succeeds (with an empty match) only at the end of the input.
  def eof(&block)
    lambda do |str|
      return nil if block && block.call == false
      (str.nil? || str == "") ? Match.new : nil
    end
  end

  # Renders a compacted AST (see Match#compact) to a string.
  #
  # ast   - String, Hash ({ name => sub-ast }) or Array of sub-asts
  # trans - object providing the handlers; defaults to self
  #
  # For each Hash entry, `trans.before_<name>(ast)` is called first if it
  # exists, then the value is rendered, then `trans.<name>` is called with
  # (rendered, ast), (rendered) or no arguments depending on its arity.
  # Entries without a handler contribute nothing to the output.
  # Returns nil for any other kind of +ast+.
  def transform(ast, trans = self)
    return ast if ast.instance_of?(String)

    if ast.instance_of?(Hash)
      rendered = []
      ast.each_pair do |key, value|
        before_method = ("before_" + key.to_s).to_sym
        trans.send(before_method, ast) if trans.respond_to?(before_method)

        # Children are rendered after the before-hook and before the handler.
        val_str = transform(value, trans)
        next unless trans.respond_to?(key)

        rendered << case trans.method(key).arity
                    when 2 then trans.send(key, val_str, ast)
                    when 1 then trans.send(key, val_str)
                    else trans.send(key)
                    end
      end
      return rendered.join
    end

    return ast.map { |node| transform(node, trans) }.join if ast.instance_of?(Array)

    nil
  end
end
#=============================================== | |
# Textile-to-HTML converter built on the Parser combinators.
#
# The grammar is a set of parser constants; `to_html` parses the input with
# `root`, compacts the match tree and renders it through HtmlCoverter.
class TextileScanner
  # Holder for the HTML rendering callbacks used by Parser#transform.
  # (The historical spelling "Coverter" is kept because it is part of the
  # public constant name.)
  class HtmlCoverter
    # One method per AST node name; Parser#transform looks them up by name
    # and passes the already-rendered content of the node.
    module HtmlCoverterModule
      # Wraps a block in its tag. The tag comes from the :block_modifier
      # entry of the block's AST (e.g. "h1", "p", "bq"); without one the
      # block is rendered as a paragraph.
      def block(str, ast)
        parts = ast[:block]
        modifier_node =
          if parts.is_a?(Array)
            parts.find { |part| part.is_a?(Hash) && part[:block_modifier] }
          end
        tag = modifier_node ? modifier_node[:block_modifier] : "p"
        tag = "blockquote" if tag == "bq"
        "<#{tag}>#{str}</#{tag}>"
      end

      # The modifier itself produces no output; `block` reads it from the AST.
      def block_modifier
        ""
      end

      def newline
        "<br />"
      end

      def em_dash
        "—"
      end

      def en_dash
        " – "
      end

      def ellipsis
        "…"
      end

      def copyright
        "©"
      end

      def trademark
        "™"
      end

      def registered
        "®"
      end

      def dimension
        "×"
      end

      def inline_text(str)
        str
      end

      def strong_phrase(str)
        "<strong>#{str}</strong>"
      end

      def emphasis_phrase(str)
        "<em>#{str}</em>"
      end

      def italic_phrase(str)
        "<i>#{str}</i>"
      end

      def inline_char(str)
        str
      end

      # Escapes a single literal character for HTML output.
      def escape(str)
        CGI.escapeHTML(str)
      end

      # List rendering keeps its running state (tag, current depth, depth
      # change of the latest bullet) in class variables of this module, so
      # the state is shared by everything that uses the module.

      # Called before the items of a bullet list are rendered.
      def before_bullet_list(ast)
        @@list_tag = 'ul'
        @@list_depth = 0
      end

      # Closes every list level that is still open after the last item.
      def bullet_list(str)
        str + ("</li></#{@@list_tag}>" * @@list_depth)
      end

      # Records the nesting depth (number of marker characters) of the item
      # that follows; emits nothing itself.
      def bullet(str)
        @@list_delta = str.length - @@list_depth
        @@list_depth = str.length
        ""
      end

      # Called before the items of an ordered list are rendered.
      def before_ordered_list(ast)
        @@list_tag = 'ol'
        @@list_depth = 0
      end

      # Closes every list level that is still open after the last item.
      def ordered_list(str)
        str + ("</li></#{@@list_tag}>" * @@list_depth)
      end

      # Opens/closes list levels according to the depth change recorded by
      # `bullet`, then emits the item text. The item's own </li> is written
      # by the next item or by the list closer.
      def list_item(str)
        if @@list_delta > 0
          ("<#{@@list_tag}><li>" * @@list_delta) + str
        elsif @@list_delta == 0
          "</li><li>#{str}"
        else
          "</li>" + ("</#{@@list_tag}></li>" * -@@list_delta) + "<li>" + str
        end
      end
    end

    extend HtmlCoverterModule
  end

  extend Parser

  # --- tokens ---------------------------------------------------------------
  NEWLINE = match(/\n/)
  DOUBLE_NEWLINE = repeat(NEWLINE, 2, 2)
  SPACE = match(/\s/)
  DOT = match(/\./)
  DOUBLE_DOT = match(/\.\./)
  # Both list markers are labelled :bullet; the marker length is the depth.
  BULLET = as(:bullet, repeat(match(/\*/), 1))
  COUNTER = as(:bullet, repeat(match(/\#/), 1))

  # --- block signatures -----------------------------------------------------
  HEADER_BLOCK = match(/h[1-6]/)
  PARAGRAPH_BLOCK = match(/p/)
  BLOCKQUOTE = as(:blockquote, match(/bq/))
  BLOCK_TAG = choice(BLOCKQUOTE, HEADER_BLOCK, PARAGRAPH_BLOCK)

  # --- inline text ----------------------------------------------------------
  # A character belongs to the current block unless a block end follows:
  # a blank line in normal mode, or a blank line followed by a new block
  # signature in extended ("..") mode. The guards read @@extended at parse
  # time. The brace blocks bind to `repeat` and `seq` respectively.
  INLINE_CHAR_END = absent?(choice(repeat(NEWLINE, 2, 2) { @@extended ? false : true },
                                   seq(DOUBLE_NEWLINE, BLOCK_TAG, DOT, repeat(SPACE, 1)) { @@extended ? true : false }))
  INLINE_CHAR = seq(INLINE_CHAR_END,
                    choice(as(:escape, match(/./)),
                           seq(as(:newline, NEWLINE), absent?(choice(BULLET, COUNTER)))))
  # Phrases recurse through match(:inline_element), resolved at parse time.
  STRONG_PHRASE = seq(match(/\*/), as(:strong_phrase, repeat(seq(absent?(match(/\*/)), match(:inline_element)))), match(/\*/))
  EMPHASIS_PHRASE = seq(match(/\_/), as(:emphasis_phrase, repeat(seq(absent?(match(/\_/)), match(:inline_element)))), match(/\_/))
  ITALIC_PHRASE = seq(match(/__/), as(:italic_phrase, repeat(seq(absent?(match(/__/)), match(:inline_element)))), match(/__/))
  INLINE_TEXT = as(:inline_text, repeat(match(:inline_element), 1))

  # --- blocks and lists -----------------------------------------------------
  # Records whether the block signature used ".." (extended block).
  EXTENDED_BLOCK = matched?(DOUBLE_DOT) { |m| @@extended = !m.nil? }
  BLOCK_MODIFIER = repeat(seq(as(:block_modifier, BLOCK_TAG), choice(EXTENDED_BLOCK, DOT), repeat(SPACE, 1)), 0, 1)
  BULLET_LIST_ITEM = seq(repeat(NEWLINE), BULLET, repeat(SPACE, 1), as(:list_item, INLINE_TEXT))
  BULLET_LIST = as(:bullet_list, repeat(BULLET_LIST_ITEM, 1))
  ORDERED_LIST_ITEM = seq(repeat(NEWLINE), COUNTER, repeat(SPACE, 1), as(:list_item, INLINE_TEXT))
  ORDERED_LIST = as(:ordered_list, repeat(ORDERED_LIST_ITEM, 1))

  # --- punctuation ----------------------------------------------------------
  EM_DASH = as(:em_dash, match(/\-\-/))
  EN_DASH = as(:en_dash, match(/ \- /))
  ELLIPSIS = as(:ellipsis, match(/\.\.\./))
  TRADEMARK = as(:trademark, match(/\(tm\)/))
  REGISTERED = as(:registered, match(/\(r\)/))
  COPYRIGHT = as(:copyright, match(/\(c\)/))
  # digit, optional spaces, "x", and (lookahead only) optional spaces + digit
  DIMENSION = seq(as(:inline_text, seq(match(/\d/), repeat(SPACE))), as(:dimension, match(/x/)), present?(seq(repeat(SPACE), match(/\d/))))

  # --- composed rules -------------------------------------------------------
  # Built once here instead of on every call; inline_element runs for every
  # character of the input, so rebuilding the tree each time was pure waste.
  INLINE_ELEMENT = choice(DIMENSION, COPYRIGHT, REGISTERED, TRADEMARK, ELLIPSIS, EM_DASH, EN_DASH,
                          ITALIC_PHRASE, STRONG_PHRASE, EMPHASIS_PHRASE, as(:inline_char, INLINE_CHAR))
  BLOCK = seq(repeat(DOUBLE_NEWLINE),
              choice(ORDERED_LIST, BULLET_LIST, as(:block, seq(BLOCK_MODIFIER, INLINE_TEXT))),
              repeat(DOUBLE_NEWLINE))

  # Parses one inline element (special punctuation, phrase or single char)
  # at the start of +str+. Returns a Parser::Match or nil.
  def self.inline_element(str)
    INLINE_ELEMENT.call(str)
  end

  # Parses one block (list or text block, with surrounding blank lines) at
  # the start of +str+. Resets the extended-block flag first.
  def self.block(str)
    @@extended = false
    BLOCK.call(str)
  end

  # Parses as many blocks as possible from +str+.
  # Built per call so the :block rule is dispatched on the receiver.
  def self.root(str)
    repeat(match(:block)).call(str)
  end

  # Converts Textile markup in +str+ to an HTML string.
  def self.to_html(str)
    transform(root(str).compact, HtmlCoverter)
  end
end
#================================ | |
# End-to-end tests: Textile input -> HTML produced by TextileScanner.to_html.
class TestTextileScanner < Test::Unit::TestCase
  # Paragraphs, line breaks and leading blank lines.
  def test_simple
    assert_html("<p>123</p>", "123")
    assert_html("<p>a</p><p>b</p>", "a\n\nb")
    assert_html("<p>a<br />b</p>", "a\nb")
    assert_html("<p>a</p><p>b</p><p>c</p>", "a\n\nb\n\nc")
    assert_html("<p>123</p>", "\n\n123")
  end

  # Strong / emphasis / italic phrases, including nesting and unbalanced markers.
  def test_phrase
    assert_html("<p><strong>12</strong></p>", "*12*")
    assert_html("<p>a<strong>12</strong>b</p>", "a*12*b")
    assert_html("<p>a*12</p>", "a*12")
    assert_html("<p><em>emphasis</em></p>", "_emphasis_")
    assert_html("<p><em><strong>12</strong></em></p>", "_*12*_")
    assert_html("<p>_<strong>12_</strong></p>", "_*12_*")
    assert_html("<p><i>italic</i></p>", "__italic__")
  end

  # Block signatures (h1., p., bq.) and implicit paragraphs.
  def test_block
    assert_html("<h1>test</h1>", "h1. test")
    assert_html("<h1>test</h1><p>test2</p>", "h1. test\n\ntest2")
    assert_html("<p>test</p>", "p. test")
    assert_html("<blockquote>test</blockquote>", "bq. test")
    assert_html("<p>123</p>", "\n\n123")
    assert_html("<p>a</p><p>b</p><p>c</p>", "a\n\nb\n\nc")
  end

  # An extended block ("..") continues across blank lines.
  def test_extended_block
    assert_html("<h1>test<br /><br />test2</h1>", "h1.. test\n\ntest2")
  end

  # Literal characters are passed through CGI.escapeHTML, so markup in the
  # input must come out entity-escaped rather than as a live tag.
  def test_escape
    assert_html("<p>&lt;br /&gt;</p>", "<br />")
  end

  # Typographic replacements.
  def test_punctuation
    assert_html("<p>a — b</p>", "a -- b")
    assert_html("<p>a – b</p>", "a - b")
    assert_html("<p>a-b</p>", "a-b")
    assert_html("<p>Meanwhile…</p>", "Meanwhile...")
    assert_html("<p>Registered® Trademark™ Copyright ©.</p>", "Registered(r) Trademark(tm) Copyright (c).")
    assert_html("<p>1 × 2 × 3 = 6</p>", "1 x 2 x 3 = 6")
    assert_html("<p>1×2×3 = 6</p>", "1x2x3 = 6")
  end

  # Bullet and ordered lists, flat and nested.
  def test_list
    assert_html("<ul><li>one</li><li>two</li></ul>", "* one\n* two")
    assert_html("<ul><li>one<ul><li>two</li></ul></li></ul>", "* one\n** two")
    assert_html("<ul><li>one</li><li>two</li><li>tree</li></ul>", "* one\n* two\n* tree")
    assert_html("<ul><li>one<ul><li>two</li></ul></li><li>tree</li></ul>", "* one\n** two\n* tree")
    assert_html("<ol><li>one</li><li>two</li></ol>", "# one\n# two")
    assert_html("<ol><li>Item one<ol><li>Item one-A</li><li>Item one-B<ol><li>Item one-B-a</li></ol></li></ol></li><li>Item two</li></ol>",
                "# Item one\n## Item one-A\n## Item one-B\n### Item one-B-a\n# Item two")
  end

  private

  # Asserts that +textile+ renders to exactly +expected+.
  def assert_html(expected, textile)
    assert_equal(expected, TextileScanner.to_html(textile))
  end
end
# Tiny grammar used by TestScanner to exercise the Parser combinators.
class SimpleScanner
  extend Parser

  # Returns a parser (lambda) that matches a leading "c".
  def self.last_part
    lambda do |input|
      match(/c/).call(input)
    end
  end

  # Rule referenced by name from `simple`; matches a leading "c".
  def self.node(str)
    c_parser = match(/c/)
    c_parser.call(str)
  end

  # Builds the sequence parser a, b, last_part and returns it.
  # NOTE(review): +str+ is not used and the parser is returned unapplied,
  # unlike the other entry points here -- confirm whether `.call(str)` was
  # intended.
  def self.root(str)
    seq(match(/a/), match(/b/), last_part)
  end

  # Applies the `node` rule through symbol dispatch.
  def self.simple(str)
    by_name = match(:node)
    by_name.call(str)
  end

  # Applies "a" followed by "b".
  def self.test_seq(str)
    a_then_b = seq(match(/a/), match(/b/))
    a_then_b.call(str)
  end

  # Applies "a" or "b".
  def self.test_choice(str)
    a_or_b = choice(match(/a/), match(/b/))
    a_or_b.call(str)
  end

  # Repeats a guarded "a" parser; the guard block lets the parser run three
  # times and returns false from the fourth call on.
  def self.test_block(str)
    allowed = 3
    guarded_a = as(:name, match(/a/)) do
      if allowed.zero?
        false
      else
        allowed -= 1
        true
      end
    end
    repeat(guarded_a).call(str)
  end
end
# Unit tests for the Parser combinators, Match#compact and Parser#transform.
class TestParser < Test::Unit::TestCase
  include Parser

  # Handler picked up by `transform` for nodes named :show.
  def show(str)
    "<#{str}>"
  end

  # Shapes produced by Match#compact for unnamed, named and mixed trees.
  def test_basic
    a = match(/a/)

    plain = repeat(a).call("aaa")
    assert_equal("aaa", plain.compact)

    labelled = as(:test, repeat(match(/a/))).call("aaa")
    assert_equal({ :test => "aaa" }, labelled.compact)

    each_labelled = repeat(as(:name, match(/a/))).call("aaa")
    assert_equal([{ :name => "a" }, { :name => "a" }, { :name => "a" }], each_labelled.compact)

    s = seq(match(/a/), as(:test, match(/b/)), match(/c/))
    assert_equal({ :test => "b" }, s.call("abc").compact)
  end

  # `transform` renders named nodes through the matching method on self.
  def test_transform
    whole = as(:show, repeat(match(/a/))).call("aaa").compact
    assert_equal("<aaa>", transform(whole))

    each_one = repeat(as(:show, match(/a/))).call("aaa").compact
    assert_equal("<a><a><a>", transform(each_one))

    middle = seq(match(/a/), as(:show, match(/b/)), match(/c/)).call("abc").compact
    assert_equal("<b>", transform(middle))
  end
end
# Exercises the SimpleScanner entry points.
class TestScanner < Test::Unit::TestCase
  def test_simple
    # Smoke calls: these only have to run without raising.
    SimpleScanner.root("ab")
    SimpleScanner.simple("a")

    seq_hit = SimpleScanner.test_seq("ab")
    assert_equal("ab", seq_hit.token)
    assert_nil(SimpleScanner.test_seq("ac"))

    choice_hit = SimpleScanner.test_choice("b")
    assert_equal("b", choice_hit.token)
    assert_nil(SimpleScanner.test_choice("c"))

    # The guard in test_block admits only three repetitions.
    guarded = SimpleScanner.test_block("aaaa")
    assert_equal("aaa", guarded.token)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment