Created
March 20, 2009 19:41
-
-
Save jystewart/82540 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'html2textile'

# Sample blog-post markup (title, date, author, categories and tags) used to
# exercise the HTML-to-Textile converter.
sample_html = <<HTML
<div class="column span-3">
<h3 class="storytitle entry-title" id="post-312">
<a href="http://jystewart.net/process/2007/11/converting-html-to-textile-with-ruby/" rel="bookmark">Converting HTML to Textile with Ruby</a>
</h3>
<p>
<span>23 November 2007</span>
(<abbr class="updated" title="2007-11-23T19:51:54+00:00">7:51 pm</abbr>)
</p>
<p>
By <span class="author vcard fn">James Stewart</span>
<br />filed under:
<a href="http://jystewart.net/process/category/snippets/" title="View all posts in Snippets" rel="category tag">Snippets</a>
<br />tagged: <a href="http://jystewart.net/process/tag/content-management/" rel="tag">content management</a>,
<a href="http://jystewart.net/process/tag/conversion/" rel="tag">conversion</a>,
<a href="http://jystewart.net/process/tag/html/" rel="tag">html</a>,
<a href="http://jystewart.net/process/tag/python/" rel="tag">Python</a>,
<a href="http://jystewart.net/process/tag/ruby/" rel="tag">ruby</a>,
<a href="http://jystewart.net/process/tag/textile/" rel="tag">textile</a>
</p>
<div class="feedback">
<script src="http://feeds.feedburner.com/~s/jystewart/iLiN?i=http://jystewart.net/process/2007/11/converting-html-to-textile-with-ruby/" type="text/javascript" charset="utf-8"></script>
</div>
</div>
HTML

# Feed the markup through the converter and print the resulting Textile.
converter = HTMLToTextileParser.new
converter.feed(sample_html)
puts converter.to_textile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'sgml-parser' | |
# A class to convert HTML to textile. Based on the python parser | |
# found at http://aftnn.org/content/code/html2textile/ | |
# | |
# Read more at http://jystewart.net/process/2007/11/converting-html-to-textile-with-ruby | |
# | |
# Author:: James Stewart (mailto:[email protected]) | |
# Copyright:: Copyright (c) 2007 James Stewart | |
# License:: Distributed under the same terms as Ruby
# This class is an implementation of an SGMLParser designed to convert
# HTML to textile.
#
# The parser reacts to start/end tags by writing Textile markers and the
# text they enclose into +result+, an array of string fragments that
# +to_textile+ joins into the final document.
#
# Example usage:
#   parser = HTMLToTextileParser.new
#   parser.feed(input_html)
#   puts parser.to_textile
class HTMLToTextileParser < SGMLParser
  # Array of output fragments accumulated so far.
  attr_accessor :result
  # Name of the element currently being captured, or false when none is.
  attr_accessor :in_block
  # Stack of capture buffers, one per element opened via start_capture.
  attr_accessor :data_stack
  # href of the link currently being converted (nil/false outside a link).
  attr_accessor :a_href
  # Flags set while inside <ul> / <ol>; start_li reads in_ol to pick a bullet.
  attr_accessor :in_ul
  attr_accessor :in_ol

  # Unknown tags and attributes that are passed through as raw HTML.
  # Both lists are empty, so nothing is passed through by default.
  @@permitted_tags = []
  @@permitted_attrs = []

  # verbose is forwarded unchanged to SGMLParser#initialize.
  def initialize(verbose=nil)
    @output = String.new # not read anywhere in this class
    self.in_block = false
    self.result = []
    self.data_stack = []
    super(verbose)
  end

  # Normalise space in the same manner as HTML. Any substring of multiple
  # whitespace characters will be replaced with a single space char.
  def normalise_space(s)
    s.to_s.gsub(/\s+/x, ' ')
  end

  # Builds the Textile attribute suffix for an element from its attribute
  # hash: "(class#id)" when a class and/or id is present, followed by
  # "{style}" when a style attribute is present. Returns "" when none are.
  def build_styles_ids_and_classes(attributes)
    idclass = ''
    idclass += attributes['class'] if attributes.has_key?('class')
    idclass += "\##{attributes['id']}" if attributes.has_key?('id')
    idclass = "(#{idclass})" if idclass != ''
    style = attributes.has_key?('style') ? "{#{attributes['style']}}" : ""
    "#{idclass}#{style}"
  end

  # Opens a block element: writes "tag(class#id){style}. " and starts
  # capturing its content. attributes is the [name, value] pair list
  # supplied by the SGML parser.
  def make_block_start_pair(tag, attributes)
    attributes = attrs_to_hash(attributes)
    class_style = build_styles_ids_and_classes(attributes)
    write("#{tag}#{class_style}. ")
    start_capture(tag)
  end

  # Closes a block element: flushes the capture and ends the paragraph
  # with a blank line.
  def make_block_end_pair
    stop_capture_and_write
    write("\n\n")
  end

  # Opens an inline ("quicktag") element: writes a space, the wrap
  # character and any class/id/style suffix, then starts capturing.
  def make_quicktag_start_pair(tag, wrapchar, attributes)
    attributes = attrs_to_hash(attributes)
    class_style = build_styles_ids_and_classes(attributes)
    write([" ", "#{wrapchar}#{class_style}"])
    start_capture(tag)
  end

  # Closes an inline element: flushes the capture, then writes the wrap
  # character followed by a space.
  def make_quicktag_end_pair(wrapchar)
    stop_capture_and_write
    write([wrapchar, " "])
  end

  # Appends d (a string, an array of fragments, or nil) to the output.
  # While fewer than two capture buffers are open the fragments go straight
  # to +result+; otherwise they are appended to the innermost buffer.
  def write(d)
    # Kernel#Array wraps a bare string and maps nil to []. String#to_a,
    # which this method used to call, no longer exists on Ruby 1.9+.
    pieces = Array(d)
    if data_stack.size < 2
      self.result += pieces
    else
      data_stack[-1] += pieces
    end
  end

  # Records tag as the current block and pushes a fresh capture buffer.
  def start_capture(tag)
    self.in_block = tag
    data_stack.push([])
  end

  # Pops the innermost capture buffer and writes its contents one level up.
  def stop_capture_and_write
    self.in_block = false
    write(data_stack.pop)
  end

  # Text callback: collapses whitespace, strips both ends and writes it.
  def handle_data(data)
    return if data.nil? || data == ''
    write(normalise_space(data).strip)
  end

  # <h1>..<h6> become Textile "hN. " blocks.
  %w[1 2 3 4 5 6].each do |num|
    define_method "start_h#{num}" do |attributes|
      make_block_start_pair("h#{num}", attributes)
    end

    define_method "end_h#{num}" do
      make_block_end_pair
    end
  end

  # HTML block element => Textile block signature.
  PAIRS = { 'blockquote' => 'bq', 'p' => 'p' }
  # HTML inline element => Textile wrap character(s).
  QUICKTAGS = { 'b' => '*', 'strong' => '*',
    'i' => '_', 'em' => '_', 'cite' => '??', 's' => '-',
    'sup' => '^', 'sub' => '~', 'code' => '@', 'span' => '%'}

  PAIRS.each do |key, value|
    define_method "start_#{key}" do |attributes|
      make_block_start_pair(value, attributes)
    end

    define_method "end_#{key}" do
      make_block_end_pair
    end
  end

  QUICKTAGS.each do |key, value|
    define_method "start_#{key}" do |attributes|
      make_quicktag_start_pair(key, value, attributes)
    end

    define_method "end_#{key}" do
      make_quicktag_end_pair(value)
    end
  end

  def start_ol(attrs)
    self.in_ol = true
  end

  def end_ol
    self.in_ol = false
    write("\n")
  end

  def start_ul(attrs)
    self.in_ul = true
  end

  def end_ul
    self.in_ul = false
    write("\n")
  end

  # List items use "# " inside an ordered list and "* " otherwise.
  def start_li(attrs)
    if in_ol
      write("# ")
    else
      write("* ")
    end
    start_capture("li")
  end

  def end_li
    stop_capture_and_write
    write("\n")
  end

  # Links become "text":href. Anchors without an href are ignored, so
  # their text is written as ordinary data.
  def start_a(attrs)
    attrs = attrs_to_hash(attrs)
    self.a_href = attrs['href']
    return unless a_href
    write(" \"")
    start_capture("a")
  end

  def end_a
    return unless a_href
    stop_capture_and_write
    write(["\":", a_href, " "])
    self.a_href = false
  end

  # Converts the parser's [[name, value], ...] attribute list into a hash
  # keyed by lower-cased attribute name.
  def attrs_to_hash(array)
    array.inject({}) { |collection, part| collection[part[0].downcase] = part[1]; collection }
  end

  # Images become " !src! ".
  def start_img(attrs)
    attrs = attrs_to_hash(attrs)
    write([" !", attrs["src"], "! "])
  end

  def end_img
  end

  def start_tr(attrs)
  end

  # Each table row is terminated with "|" and a newline.
  def end_tr
    write("|\n")
  end

  def start_td(attrs)
    write("|")
    start_capture("td")
  end

  def end_td
    stop_capture_and_write
    write("|")
  end

  def start_br(attrs)
    write("\n")
  end

  # Tags without a start_/do_ handler are dropped unless listed in
  # @@permitted_tags, in which case they are re-emitted as HTML with only
  # the attributes listed in @@permitted_attrs.
  def unknown_starttag(tag, attrs)
    return unless @@permitted_tags.include?(tag)
    write(["<", tag])
    attrs.each do |key, value|
      write([" ", key, "=\"", value, "\""]) if @@permitted_attrs.include?(key)
    end
    # Close the opening tag so the emitted HTML is well formed.
    write(">")
  end

  def unknown_endtag(tag)
    if @@permitted_tags.include?(tag)
      write(["</", tag, ">"])
    end
  end

  # Return the textile after processing
  def to_textile
    result.join
  end

  # UNCONVERTED PYTHON METHODS
  #
  # def handle_charref(self, tag):
  #   self._write(unichr(int(tag)))
  #
  # def handle_entityref(self, tag):
  #   if self.entitydefs.has_key(tag):
  #     self._write(self.entitydefs[tag])
  #
  # def handle_starttag(self, tag, method, attrs):
  #   method(dict(attrs))
  #
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A parser for SGML, using the derived class as static DTD.
#
# Subclasses describe the markup they understand by defining methods:
# start_TAG(attrs) / end_TAG for container elements (tracked on an internal
# stack), do_TAG(attrs) for elements without an end tag, and by overriding
# the handle_* / unknown_* hooks, which are all no-ops in this class.
#
# Input is supplied with #feed (any number of times) and finished with
# #close, which flushes whatever could not yet be parsed as plain data.
class SGMLParser

  # Regular expressions used for parsing:
  Interesting = /[&<]/
  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                              '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                              '![^<>]*)?')

  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
  Charref = /&#([0-9]+)[^0-9]/

  Starttagopen = /<[>a-zA-Z]/
  Endtagopen = /<\/[<>a-zA-Z]/
  Endbracket = /[<>]/
  Special = /<![^<>]*>/
  Commentopen = /<!--/
  Commentclose = /--[ \t\n]*>/
  Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
  Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                            '(\s*=\s*' +
                            "('[^']*'" +
                            '|"[^"]*"' +
                            '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

  # Entity names resolved by the default handle_entityref.
  Entitydefs =
    {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

  # verbose enables the diagnostics printed by report_unbalanced.
  def initialize(verbose=false)
    @verbose = verbose
    reset
  end

  # Discards buffered input and all parser state.
  def reset
    @rawdata = ''
    @stack = []
    @lasttag = '???'
    @nomoretags = false
    @literal = false
  end

  # True when an element named gi is currently open.
  def has_context(gi)
    @stack.include? gi
  end

  # Treat all remaining input as plain data.
  def setnomoretags
    @nomoretags = true
    @literal = true
  end

  # Treat start tags, comments and declarations as data until the next
  # end tag is parsed (goahead clears the flag there).
  def setliteral(*args)
    @literal = true
  end

  # Appends data to the buffer and parses as much of it as is complete.
  def feed(data)
    @rawdata << data
    goahead(false)
  end

  # Parses the remaining buffer, emitting anything unparseable as data.
  def close
    goahead(true)
  end

  # Main loop: scans the buffer for "<" and "&", dispatching tags,
  # comments, declarations and references to their parse_*/handle_*
  # methods and everything else to handle_data. Stops at the first
  # construct that is still incomplete and keeps it buffered; when _end
  # is true that remainder is emitted as data instead.
  def goahead(_end)
    rawdata = @rawdata
    i = 0
    n = rawdata.length
    while i < n
      if @nomoretags
        handle_data(rawdata[i..(n-1)])
        i = n
        break
      end
      j = rawdata.index(Interesting, i) || n
      handle_data(rawdata[i..(j-1)]) if i < j
      i = j
      break if i == n
      case rawdata[i, 1]
      when '<'
        if rawdata.index(Starttagopen, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_starttag(i)
          break unless k
          i = k
          next
        end
        if rawdata.index(Endtagopen, i) == i
          k = parse_endtag(i)
          break unless k
          i = k
          @literal = false
          next
        end
        if rawdata.index(Commentopen, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_comment(i)
          break unless k
          i += k # parse_comment returns a length, not a position
          next
        end
        if rawdata.index(Special, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_special(i)
          break unless k
          i += k # parse_special returns a length, not a position
          next
        end
      when '&'
        if rawdata.index(Charref, i) == i
          i += $&.length
          handle_charref($1)
          # The pattern consumes one terminator character; give it back
          # unless it was the ";" that belongs to the reference.
          i -= 1 unless rawdata[i - 1, 1] == ';'
          next
        end
        if rawdata.index(Entityref, i) == i
          i += $&.length
          handle_entityref($1)
          i -= 1 unless rawdata[i - 1, 1] == ';'
          next
        end
      else
        raise RuntimeError, 'neither < nor & ??'
      end
      # We get here only if incomplete matches but
      # nothing else
      match = rawdata.index(Incomplete, i)
      unless match == i
        handle_data(rawdata[i, 1])
        i += 1
        next
      end
      j = match + $&.length
      break if j == n # Really incomplete
      handle_data(rawdata[i..(j-1)])
      i = j
    end
    # end while
    if _end and i < n
      handle_data(@rawdata[i..(n-1)])
      i = n
    end
    @rawdata = rawdata[i..-1]
  end

  # Parses the comment starting at i and passes its text to
  # handle_comment. Returns the number of characters consumed, or nil
  # when the closing "-->" has not arrived yet.
  def parse_comment(i)
    rawdata = @rawdata
    if rawdata[i, 4] != '<!--'
      raise RuntimeError, 'unexpected call to handle_comment'
    end
    match = rawdata.index(Commentclose, i)
    return nil unless match
    matched_length = $&.length
    j = match
    handle_comment(rawdata[i+4..(j-1)])
    j = match + matched_length
    return j-i
  end

  # Parses the start tag beginning at i, collects its attributes as
  # [name, value] pairs (names lower-cased, surrounding quotes removed,
  # valueless attributes given '') and dispatches via finish_starttag.
  # Returns the position just past the tag, or nil when the tag is not
  # terminated yet.
  def parse_starttag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    attrs = []
    if rawdata[i + 1, 1] == '>'
      # SGML shorthand: <> == <last open tag seen>
      k = j
      tag = @lasttag
    else
      match = rawdata.index(Tagfind, i + 1)
      unless match
        raise RuntimeError, 'unexpected call to parse_starttag'
      end
      k = i + 1 + ($&.length)
      tag = $&.downcase
      @lasttag = tag
    end
    while k < j
      pos = rawdata.index(Attrfind, k)
      # Only accept attributes that start inside this tag. An unbounded
      # search would pick up text following the tag (e.g. "<br />filed")
      # as a bogus attribute.
      break if pos.nil? || pos >= j
      matched_length = $&.length
      attrname, rest, attrvalue = $1, $2, $3
      if not rest
        attrvalue = '' # was: = attrname
      elsif (attrvalue[0, 1] == "'" && attrvalue[-1, 1] == "'") or
          (attrvalue[0, 1] == '"' && attrvalue[-1, 1] == '"')
        attrvalue = attrvalue[1..-2]
      end
      attrs << [attrname.downcase, attrvalue]
      # Resume after the match itself, accounting for any skipped junk.
      k = pos + matched_length
    end
    if rawdata[j, 1] == '>'
      j += 1
    end
    finish_starttag(tag, attrs)
    return j
  end

  # Parses the end tag beginning at i and dispatches via finish_endtag.
  # Returns the position just past the tag, or nil when it is incomplete.
  def parse_endtag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    tag = (rawdata[i+2..j-1].strip).downcase
    if rawdata[j, 1] == '>'
      j += 1
    end
    finish_endtag(tag)
    return j
  end

  # Dispatches a start tag. Returns 1 when start_TAG handled it (the tag
  # is pushed on the stack), 0 when do_TAG handled it, and -1 when it
  # went to unknown_starttag.
  def finish_starttag(tag, attrs)
    method = 'start_' + tag
    if self.respond_to?(method)
      @stack << tag
      handle_starttag(tag, method, attrs)
      return 1
    else
      method = 'do_' + tag
      if self.respond_to?(method)
        handle_starttag(tag, method, attrs)
        return 0
      else
        unknown_starttag(tag, attrs)
        return -1
      end
    end
  end

  # Dispatches an end tag. An empty tag name ("</>") closes the innermost
  # open element. Otherwise every element opened since the most recent
  # matching start tag is closed, innermost first.
  def finish_endtag(tag)
    if tag == ''
      found = @stack.length - 1
      if found < 0
        unknown_endtag(tag)
        return
      end
    else
      unless @stack.include? tag
        if self.respond_to?('end_' + tag)
          # Known element, but it is not open.
          report_unbalanced(tag)
        else
          unknown_endtag(tag)
        end
        return
      end
      # rindex, not index: with nested same-named elements the end tag
      # belongs to the innermost one.
      found = @stack.rindex(tag)
    end
    while @stack.length > found
      tag = @stack[-1]
      method = 'end_' + tag
      if respond_to?(method)
        handle_endtag(tag, method)
      else
        unknown_endtag(tag)
      end
      @stack.pop
    end
  end

  # Parses the "<!...>" declaration starting at i and passes its content
  # to handle_special. Returns the number of characters consumed, or nil
  # when it is incomplete.
  def parse_special(i)
    rawdata = @rawdata
    match = rawdata.index(Endbracket, i+1)
    return nil unless match
    matched_length = $&.length
    handle_special(rawdata[i+1..(match-1)])
    return match - i + matched_length
  end

  # Invokes the start_TAG / do_TAG method named by method.
  def handle_starttag(tag, method, attrs)
    self.send(method, attrs)
  end

  # Invokes the end_TAG method named by method.
  def handle_endtag(tag, method)
    self.send(method)
  end

  # In verbose mode, prints a diagnostic for an end tag whose element is
  # known but not open.
  def report_unbalanced(tag)
    return unless @verbose
    print '*** Unbalanced </' + tag + '>', "\n"
    print '*** Stack:', @stack.inspect, "\n"
  end

  # Handles "&#NNN;". Code points 0..255 are emitted as data via
  # Integer#chr; anything else goes to unknown_charref.
  def handle_charref(name)
    # Character references are always decimal. Without the explicit base
    # a leading zero would be read as octal ("010" => 8, "08" => error).
    n = Integer(name, 10)
    if !(0 <= n && n <= 255)
      unknown_charref(name)
      return
    end
    handle_data(n.chr)
  end

  # Handles "&name;" using Entitydefs; unknown names go to
  # unknown_entityref.
  def handle_entityref(name)
    table = Entitydefs
    if table.include?(name)
      handle_data(table[name])
    else
      unknown_entityref(name)
      return
    end
  end

  # Hooks for subclasses; every one is a no-op by default.

  def handle_data(data)
  end

  def handle_comment(data)
  end

  def handle_special(data)
  end

  def unknown_starttag(tag, attrs)
  end

  def unknown_endtag(tag)
  end

  def unknown_charref(ref)
  end

  def unknown_entityref(ref)
  end

end
Thanks, I was going to whip it up, but I thought I'd ask if you planned to do... I've found a couple of issues with code blocks (pre/code) and also named anchors.
I'll have a look at fixing them if I get a chance, I guess it would be a good idea to turn this into a proper github repo.
Added an HTML-to-Markdown parser: http://gist.github.com/441545#file_html2markdown.rb
I've created a proper repository out of it at http://github.com/jystewart/html2textile
It's a very quick conversion and could almost certainly do with work, but it's there. If you could fork that and add your markdown one then I can begin the proper packaging. Probably ought to write some tests/specs one of these days too...
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It's been a long time since I've thought about this code at all! But cutting a gem of it does seem like a good idea. I'll probably get onto that at the weekend or next week and post here again when it's done.