-
-
Save jystewart/82540 to your computer and use it in GitHub Desktop.
# Example: feed a fragment of blog-post HTML through HTMLToTextileParser
# and print the Textile it produces.
require 'html2textile'

sample_html = <<HTML
<div class="column span-3">
<h3 class="storytitle entry-title" id="post-312">
<a href="http://jystewart.net/process/2007/11/converting-html-to-textile-with-ruby/" rel="bookmark">Converting HTML to Textile with Ruby</a>
</h3>
<p>
<span>23 November 2007</span>
(<abbr class="updated" title="2007-11-23T19:51:54+00:00">7:51 pm</abbr>)
</p>
<p>
By <span class="author vcard fn">James Stewart</span>
<br />filed under:
<a href="http://jystewart.net/process/category/snippets/" title="View all posts in Snippets" rel="category tag">Snippets</a>
<br />tagged: <a href="http://jystewart.net/process/tag/content-management/" rel="tag">content management</a>,
<a href="http://jystewart.net/process/tag/conversion/" rel="tag">conversion</a>,
<a href="http://jystewart.net/process/tag/html/" rel="tag">html</a>,
<a href="http://jystewart.net/process/tag/python/" rel="tag">Python</a>,
<a href="http://jystewart.net/process/tag/ruby/" rel="tag">ruby</a>,
<a href="http://jystewart.net/process/tag/textile/" rel="tag">textile</a>
</p>
<div class="feedback">
<script src="http://feeds.feedburner.com/~s/jystewart/iLiN?i=http://jystewart.net/process/2007/11/converting-html-to-textile-with-ruby/" type="text/javascript" charset="utf-8"></script>
</div>
</div>
HTML

# Build a converter, push the markup through it, and emit the result.
converter = HTMLToTextileParser.new
converter.feed(sample_html)
puts converter.to_textile
require 'sgml-parser' | |
# A class to convert HTML to textile. Based on the python parser
# found at http://aftnn.org/content/code/html2textile/
#
# Read more at http://jystewart.net/process/2007/11/converting-html-to-textile-with-ruby
#
# Author::    James Stewart  (mailto:[email protected])
# Copyright:: Copyright (c) 2007 James Stewart
# License::   Distributes under the same terms as Ruby

# This class is an implementation of an SGMLParser designed to convert
# HTML to textile.
#
# Output is accumulated as an array of string fragments in +result+ and
# joined by #to_textile. Tags that have no start_/end_ handler here are
# dropped unless they are listed in @@permitted_tags, in which case they are
# passed through as raw markup (restricted to @@permitted_attrs).
#
# Example usage:
#   parser = HTMLToTextileParser.new
#   parser.feed(input_html)
#   puts parser.to_textile
class HTMLToTextileParser < SGMLParser

  # Fragments of textile emitted so far.
  attr_accessor :result
  # Name of the tag currently being captured, or false when none is.
  attr_accessor :in_block
  # Stack of fragment buffers, one per open captured element.
  attr_accessor :data_stack
  # href of the anchor currently open (nil/false when no link is open).
  attr_accessor :a_href
  # Flags set while inside a <ul> / <ol> element.
  attr_accessor :in_ul
  attr_accessor :in_ol

  # Tags / attributes that are passed through verbatim when no dedicated
  # handler exists. Both lists are empty by default.
  @@permitted_tags = []
  @@permitted_attrs = []

  # verbose:: forwarded unchanged to SGMLParser#initialize.
  def initialize(verbose=nil)
    self.in_block = false
    self.result = []
    self.data_stack = []
    super(verbose)
  end

  # Normalise space in the same manner as HTML. Any substring of multiple
  # whitespace characters will be replaced with a single space char.
  def normalise_space(s)
    s.to_s.gsub(/\s+/, ' ')
  end

  # Build the textile modifier string for an element from its attribute
  # hash: "(class#id)" followed by "{style}", each part only when present.
  def build_styles_ids_and_classes(attributes)
    idclass = ''
    idclass += attributes['class'] if attributes.has_key?('class')
    idclass += "\##{attributes['id']}" if attributes.has_key?('id')
    idclass = "(#{idclass})" unless idclass.empty?

    style = attributes.has_key?('style') ? "{#{attributes['style']}}" : ""
    "#{idclass}#{style}"
  end

  # Emit the opening of a block element ("tag(class#id){style}. ") and
  # start capturing its content.
  def make_block_start_pair(tag, attributes)
    attributes = attrs_to_hash(attributes)
    class_style = build_styles_ids_and_classes(attributes)
    write("#{tag}#{class_style}. ")
    start_capture(tag)
  end

  # Flush the captured block content and terminate the block with a
  # blank line.
  def make_block_end_pair
    stop_capture_and_write
    write("\n\n")
  end

  # Emit the opening marker of an inline ("quicktag") element, preceded by
  # a space, and start capturing its content.
  def make_quicktag_start_pair(tag, wrapchar, attributes)
    attributes = attrs_to_hash(attributes)
    class_style = build_styles_ids_and_classes(attributes)
    write([" ", "#{wrapchar}#{class_style}"])
    start_capture(tag)
  end

  # Flush the captured inline content and emit the closing marker followed
  # by a space.
  def make_quicktag_end_pair(wrapchar)
    stop_capture_and_write
    write([wrapchar, " "])
  end

  # Append one fragment (String) or several (Array) to the output.
  #
  # While fewer than two captures are open the fragments go straight to
  # +result+; otherwise they go to the innermost capture buffer.
  def write(d)
    # Array() accepts a String, an Array or nil. String#to_a, which this
    # method used to rely on, no longer exists in Ruby 1.9 and later.
    chunks = Array(d)
    if data_stack.size < 2
      result.concat(chunks)
    else
      data_stack[-1].concat(chunks)
    end
  end

  # Mark +tag+ as the element being captured and open a new buffer for it.
  def start_capture(tag)
    self.in_block = tag
    data_stack.push([])
  end

  # Close the innermost capture buffer and write its content out.
  def stop_capture_and_write
    self.in_block = false
    write(data_stack.pop)
  end

  # Text handler: whitespace is collapsed and the text is stripped before
  # being written. nil and empty strings are ignored.
  def handle_data(data)
    return if data.nil? || data == ''
    write(normalise_space(data).strip)
  end

  # Headings h1..h6 map to the textile blocks of the same name.
  (1..6).each do |num|
    define_method "start_h#{num}" do |attributes|
      make_block_start_pair("h#{num}", attributes)
    end

    define_method "end_h#{num}" do
      make_block_end_pair
    end
  end

  # HTML block tag => textile block signature.
  PAIRS = { 'blockquote' => 'bq', 'p' => 'p' }
  # HTML inline tag => textile wrapping character(s).
  QUICKTAGS = { 'b' => '*', 'strong' => '*',
    'i' => '_', 'em' => '_', 'cite' => '??', 's' => '-',
    'sup' => '^', 'sub' => '~', 'code' => '@', 'span' => '%'}

  PAIRS.each do |key, value|
    define_method "start_#{key}" do |attributes|
      make_block_start_pair(value, attributes)
    end

    define_method "end_#{key}" do
      make_block_end_pair
    end
  end

  QUICKTAGS.each do |key, value|
    define_method "start_#{key}" do |attributes|
      make_quicktag_start_pair(key, value, attributes)
    end

    define_method "end_#{key}" do
      make_quicktag_end_pair(value)
    end
  end

  def start_ol(attrs)
    self.in_ol = true
  end

  def end_ol
    self.in_ol = false
    write("\n")
  end

  def start_ul(attrs)
    self.in_ul = true
  end

  def end_ul
    self.in_ul = false
    write("\n")
  end

  # List items are prefixed with "# " inside an <ol> and "* " otherwise.
  def start_li(attrs)
    write(in_ol ? "# " : "* ")
    start_capture("li")
  end

  def end_li
    stop_capture_and_write
    write("\n")
  end

  # Anchors with an href become "text":url links. Anchors without an href
  # produce no markup of their own.
  def start_a(attrs)
    attrs = attrs_to_hash(attrs)
    self.a_href = attrs['href']
    return unless a_href

    write(" \"")
    start_capture("a")
  end

  def end_a
    return unless a_href

    stop_capture_and_write
    write(["\":", a_href, " "])
    self.a_href = false
  end

  # Convert the parser's [[name, value], ...] attribute list into a Hash
  # keyed by lower-cased attribute name.
  def attrs_to_hash(array)
    array.inject({}) do |collection, (name, value)|
      collection[name.downcase] = value
      collection
    end
  end

  def start_img(attrs)
    attrs = attrs_to_hash(attrs)
    write([" !", attrs["src"], "! "])
  end

  def end_img
  end

  def start_tr(attrs)
  end

  # Close the row with its final cell delimiter.
  def end_tr
    write("|\n")
  end

  # Each cell contributes its leading "|"; the row's trailing "|" comes
  # from end_tr.
  def start_td(attrs)
    write("|")
    start_capture("td")
  end

  # Only flush the cell content here. Writing another "|" (as this method
  # used to) doubled every delimiter, e.g. "|a||b||" instead of "|a|b|".
  def end_td
    stop_capture_and_write
  end

  def start_br(attrs)
    write("\n")
  end

  # Pass a tag without a dedicated handler through as raw markup, but only
  # if it is listed in @@permitted_tags; attributes are filtered through
  # @@permitted_attrs.
  def unknown_starttag(tag, attrs)
    return unless @@permitted_tags.include?(tag)

    write(["<", tag])
    attrs.each do |key, value|
      write([" ", key, "=\"", value, "\""]) if @@permitted_attrs.include?(key)
    end
    # Terminate the opening tag; without this the emitted markup was "<tag".
    write(">")
  end

  def unknown_endtag(tag)
    write(["</", tag, ">"]) if @@permitted_tags.include?(tag)
  end

  # Return the textile after processing
  def to_textile
    result.join
  end

  # UNCONVERTED PYTHON METHODS
  #
  # def handle_charref(self, tag):
  #     self._write(unichr(int(tag)))
  #
  # def handle_entityref(self, tag):
  #     if self.entitydefs.has_key(tag):
  #         self._write(self.entitydefs[tag])
  #
  # def handle_starttag(self, tag, method, attrs):
  #     method(dict(attrs))
  #

end
# A parser for SGML, using the derived class as static DTD.
#
# Input is supplied with #feed (any number of times) and flushed with #close.
# For a start tag <foo> the parser calls start_foo(attrs) if the subclass
# defines it (and remembers the tag as open), otherwise do_foo(attrs) if
# defined, otherwise unknown_starttag(tag, attrs). End tags call end_foo or
# unknown_endtag(tag). Text, comments, declarations and references are
# reported through the handle_* methods, all of which do nothing here.
# +attrs+ is an array of [name, value] pairs with lower-cased names.
class SGMLParser

  # Regular expressions used for parsing:
  Interesting = /[&<]/
  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                              '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                              '![^<>]*)?')

  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
  Charref = /&#([0-9]+)[^0-9]/

  Starttagopen = /<[>a-zA-Z]/
  Endtagopen = /<\/[<>a-zA-Z]/
  Endbracket = /[<>]/
  Special = /<![^<>]*>/
  Commentopen = /<!--/
  Commentclose = /--[ \t\n]*>/
  Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
  Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                            '(\s*=\s*' +
                            "('[^']*'" +
                            '|"[^"]*"' +
                            '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

  # Entity name => replacement text used by handle_entityref.
  Entitydefs =
    {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

  # verbose:: when true, report_unbalanced prints diagnostics.
  def initialize(verbose=false)
    @verbose = verbose
    reset
  end

  # Discard any buffered input and return the parser to its initial state.
  def reset
    # dup so the buffer stays appendable even if string literals are frozen.
    @rawdata = ''.dup
    @stack = []
    @lasttag = '???'
    @nomoretags = false
    @literal = false
  end

  # True if an element named +gi+ is currently open.
  def has_context(gi)
    @stack.include? gi
  end

  # Treat all remaining input as plain data.
  def setnomoretags
    @nomoretags = true
    @literal = true
  end

  # Enter literal mode: start tags, comments and declarations are passed
  # to handle_data until the next end tag has been processed.
  def setliteral(*args)
    @literal = true
  end

  # Append +data+ to the buffer and process as much of it as is complete.
  # Trailing input that may be an unfinished construct stays buffered.
  def feed(data)
    @rawdata << data
    goahead(false)
  end

  # Process everything still buffered, treating leftovers as data.
  def close
    goahead(true)
  end

  # Main scanning loop. Works through @rawdata dispatching to the parse_*
  # and handle_* methods, then keeps whatever could not be consumed yet.
  #
  # _end:: when true, any unparsed remainder is flushed via handle_data.
  def goahead(_end)
    rawdata = @rawdata
    i = 0
    n = rawdata.length
    while i < n
      if @nomoretags
        handle_data(rawdata[i..(n-1)])
        i = n
        break
      end
      # Everything up to the next "<" or "&" is plain data.
      j = rawdata.index(Interesting, i)
      j = n unless j
      if i < j
        handle_data(rawdata[i..(j-1)])
      end
      i = j
      break if (i == n)
      if rawdata[i] == ?<
        if rawdata.index(Starttagopen, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_starttag(i)
          break unless k
          i = k
          next
        end
        if rawdata.index(Endtagopen, i) == i
          k = parse_endtag(i)
          break unless k
          i = k
          @literal = false
          next
        end
        if rawdata.index(Commentopen, i) == i
          if @literal
            handle_data(rawdata[i,1])
            i += 1
            next
          end
          # parse_comment returns a length, not an absolute position.
          k = parse_comment(i)
          break unless k
          i += k
          next
        end
        if rawdata.index(Special, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          # parse_special returns a length, not an absolute position.
          k = parse_special(i)
          break unless k
          i += k
          next
        end
      elsif rawdata[i] == ?&
        if rawdata.index(Charref, i) == i
          i += $&.length
          handle_charref($1)
          # The pattern consumes one terminating character; give it back
          # unless it was the ";" that belongs to the reference.
          i -= 1 unless rawdata[i-1] == ?;
          next
        end
        if rawdata.index(Entityref, i) == i
          i += $&.length
          handle_entityref($1)
          i -= 1 unless rawdata[i-1] == ?;
          next
        end
      else
        raise RuntimeError, 'neither < nor & ??'
      end
      # We get here only if incomplete matches but
      # nothing else
      match = rawdata.index(Incomplete, i)
      unless match == i
        handle_data(rawdata[i, 1])
        i += 1
        next
      end
      j = match + $&.length
      break if j == n # Really incomplete
      handle_data(rawdata[i..(j-1)])
      i = j
    end
    # end while
    if _end && i < n
      handle_data(@rawdata[i..(n-1)])
      i = n
    end
    @rawdata = rawdata[i..-1]
  end

  # Parse the comment starting at index +i+ and pass its text to
  # handle_comment. Returns the number of characters consumed, or nil if
  # the comment terminator has not been seen yet.
  def parse_comment(i)
    rawdata = @rawdata
    if rawdata[i, 4] != '<!--'
      raise RuntimeError, 'unexpected call to handle_comment'
    end
    match = rawdata.index(Commentclose, i)
    return nil unless match
    matched_length = $&.length
    j = match
    handle_comment(rawdata[i+4..(j-1)])
    j = match + matched_length
    return j-i
  end

  # Parse the start tag at index +i+, collect its attributes and dispatch
  # via finish_starttag. Returns the index just past the tag, or nil if
  # the closing bracket has not arrived yet.
  def parse_starttag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    attrs = []
    if rawdata[i+1] == ?>
      # SGML shorthand: <> == <last open tag seen>
      k = j
      tag = @lasttag
    else
      match = rawdata.index(Tagfind, i + 1)
      unless match
        raise RuntimeError, 'unexpected call to parse_starttag'
      end
      k = i + 1 + ($&.length)
      tag = $&.downcase
      @lasttag = tag
    end
    while k < j
      # The attribute must begin exactly at k. An unanchored search would
      # skip over leftovers such as the "/" in "<br />" and pick up text
      # that follows the tag as if it were an attribute.
      break unless rawdata.index(Attrfind, k) == k
      matched_length = $&.length
      attrname, rest, attrvalue = $1, $2, $3
      if not rest
        attrvalue = '' # was: = attrname
      elsif (attrvalue[0] == ?' && attrvalue[-1] == ?') ||
            (attrvalue[0] == ?" && attrvalue[-1] == ?")
        # Strip the surrounding quotes.
        attrvalue = attrvalue[1..-2]
      end
      attrs << [attrname.downcase, attrvalue]
      k += matched_length
    end
    if rawdata[j] == ?>
      j += 1
    end
    finish_starttag(tag, attrs)
    return j
  end

  # Parse the end tag at index +i+ and dispatch via finish_endtag.
  # Returns the index just past the tag, or nil if it is incomplete.
  def parse_endtag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    tag = (rawdata[i+2..j-1].strip).downcase
    if rawdata[j] == ?>
      j += 1
    end
    finish_endtag(tag)
    return j
  end

  # Dispatch a start tag. Returns 1 when start_<tag> handled it (the tag
  # is pushed on the open-element stack), 0 when do_<tag> handled it, and
  # -1 when it was passed to unknown_starttag.
  def finish_starttag(tag, attrs)
    method = 'start_' + tag
    if self.respond_to?(method)
      @stack << tag
      handle_starttag(tag, method, attrs)
      return 1
    else
      method = 'do_' + tag
      if self.respond_to?(method)
        handle_starttag(tag, method, attrs)
        return 0
      else
        unknown_starttag(tag, attrs)
        return -1
      end
    end
  end

  # Dispatch an end tag. An empty tag name ("</>") closes the most recent
  # open element. Otherwise every element opened after the innermost
  # matching one is closed first, then the matching element itself.
  def finish_endtag(tag)
    if tag == ''
      found = @stack.length - 1
      if found < 0
        unknown_endtag(tag)
        return
      end
    else
      unless @stack.include? tag
        method = 'end_' + tag
        unless self.respond_to?(method)
          unknown_endtag(tag)
        end
        return
      end
      # rindex selects the innermost open element of this name. Using the
      # outermost one made the first "</span>" of nested spans close both.
      found = @stack.rindex(tag)
    end
    while @stack.length > found
      tag = @stack[-1]
      method = 'end_' + tag
      if respond_to?(method)
        handle_endtag(tag, method)
      else
        unknown_endtag(tag)
      end
      @stack.pop
    end
  end

  # Parse the "<!...>" declaration at index +i+ and pass its content to
  # handle_special. Returns the number of characters consumed, or nil if
  # no closing bracket is available yet.
  def parse_special(i)
    rawdata = @rawdata
    match = rawdata.index(Endbracket, i+1)
    return nil unless match
    matched_length = $&.length
    handle_special(rawdata[i+1..(match-1)])
    return match - i + matched_length
  end

  # Invoke the start-tag handler named +method+ with the attribute list.
  def handle_starttag(tag, method, attrs)
    self.send(method, attrs)
  end

  # Invoke the end-tag handler named +method+.
  def handle_endtag(tag, method)
    self.send(method)
  end

  # In verbose mode, print a diagnostic about an unbalanced end tag along
  # with the current open-element stack.
  def report_unbalanced(tag)
    if @verbose
      print '*** Unbalanced </' + tag + '>', "\n"
      print '*** Stack:', @stack.inspect, "\n"
    end
  end

  # Handle a numeric character reference. Codes 0..255 are forwarded to
  # handle_data as a single character; anything else goes to
  # unknown_charref.
  def handle_charref(name)
    # Force base 10: without it a leading zero selects octal, so "065"
    # became 53 and "08" raised ArgumentError.
    n = Integer(name, 10)
    if !(0 <= n && n <= 255)
      unknown_charref(name)
      return
    end
    handle_data(n.chr)
  end

  # Handle a named entity reference: names found in Entitydefs are
  # forwarded to handle_data, the rest go to unknown_entityref.
  def handle_entityref(name)
    table = Entitydefs
    if table.include?(name)
      handle_data(table[name])
    else
      unknown_entityref(name)
      return
    end
  end

  # The methods below are no-op hooks intended to be overridden.

  def handle_data(data)
  end

  def handle_comment(data)
  end

  def handle_special(data)
  end

  def unknown_starttag(tag, attrs)
  end

  def unknown_endtag(tag)
  end

  def unknown_charref(ref)
  end

  def unknown_entityref(ref)
  end

end
It's been a long time since I've thought about this code at all! But cutting a gem of it does seem like a good idea. I'll probably get onto that at the weekend or next week and post here again when it's done.
Thanks, I was going to whip it up, but I thought I'd ask if you planned to do... I've found a couple of issues with code blocks (pre/code) and also named anchors.
I'll have a look at fixing them if I get a chance, I guess it would be a good idea to turn this into a proper github repo.
added HTML 2 markdown parser .. http://gist.github.com/441545#file_html2markdown.rb
I've created a proper repository out of it at http://github.com/jystewart/html2textile
It's a very quick conversion and could almost certainly do with work, but it's there. If you could fork that and add your markdown one then I can begin the proper packaging. Probably ought to write some tests/specs one of these days too...
Hi James, have you thought about cutting a gem of this? It works great for me by the way, thanks for your efforts.