-
-
Save jystewart/82540 to your computer and use it in GitHub Desktop.
# Usage example: convert a fragment of blog-post HTML to Textile and print it.
require 'html2textile'

# Sample input: the header block of a blog post (title link, date, author,
# category/tag links and a feed script).
sample_html = <<HTML
<div class="column span-3">
<h3 class="storytitle entry-title" id="post-312">
<a href="http://jystewart.net/process/2007/11/converting-html-to-textile-with-ruby/" rel="bookmark">Converting HTML to Textile with Ruby</a>
</h3>
<p>
<span>23 November 2007</span>
(<abbr class="updated" title="2007-11-23T19:51:54+00:00">7:51 pm</abbr>)
</p>
<p>
By <span class="author vcard fn">James Stewart</span>
<br />filed under:
<a href="http://jystewart.net/process/category/snippets/" title="View all posts in Snippets" rel="category tag">Snippets</a>
<br />tagged: <a href="http://jystewart.net/process/tag/content-management/" rel="tag">content management</a>,
<a href="http://jystewart.net/process/tag/conversion/" rel="tag">conversion</a>,
<a href="http://jystewart.net/process/tag/html/" rel="tag">html</a>,
<a href="http://jystewart.net/process/tag/python/" rel="tag">Python</a>,
<a href="http://jystewart.net/process/tag/ruby/" rel="tag">ruby</a>,
<a href="http://jystewart.net/process/tag/textile/" rel="tag">textile</a>
</p>
<div class="feedback">
<script src="http://feeds.feedburner.com/~s/jystewart/iLiN?i=http://jystewart.net/process/2007/11/converting-html-to-textile-with-ruby/" type="text/javascript" charset="utf-8"></script>
</div>
</div>
HTML

# Feed the markup through the converter, then emit the accumulated Textile.
converter = HTMLToTextileParser.new
converter.feed(sample_html)
puts converter.to_textile
| require 'sgml-parser' | |
| # A class to convert HTML to textile. Based on the python parser | |
| # found at http://aftnn.org/content/code/html2textile/ | |
| # | |
| # Read more at http://jystewart.net/process/2007/11/converting-html-to-textile-with-ruby | |
| # | |
| # Author:: James Stewart (mailto:[email protected]) | |
| # Copyright:: Copyright (c) 2007 James Stewart | |
| # License:: Distributed under the same terms as Ruby | |
# This class is an implementation of an SGMLParser designed to convert
# HTML to textile.
#
# Example usage:
#   parser = HTMLToTextileParser.new
#   parser.feed(input_html)
#   puts parser.to_textile
class HTMLToTextileParser < SGMLParser
  # Array of output fragments; joined by #to_textile.
  attr_accessor :result
  # Name of the tag whose content is currently being captured, or false.
  attr_accessor :in_block
  # Stack of capture buffers (one array per open capturing tag).
  attr_accessor :data_stack
  # href of the <a> element being processed (nil/false when there is none).
  attr_accessor :a_href
  # True while inside a <ul> element.
  attr_accessor :in_ul
  # True while inside an <ol> element.
  attr_accessor :in_ol

  # Tags (and their attributes) without a dedicated handler that should be
  # passed through to the output as raw HTML. Both lists are empty by default,
  # so unknown tags are dropped.
  @@permitted_tags = []
  @@permitted_attrs = []

  # verbose is forwarded unchanged to SGMLParser#initialize.
  def initialize(verbose = nil)
    @output = String.new
    self.in_block = false
    self.result = []
    self.data_stack = []
    super(verbose)
  end

  # Normalise space in the same manner as HTML. Any substring of multiple
  # whitespace characters will be replaced with a single space char.
  def normalise_space(s)
    s.to_s.gsub(/\s+/, ' ')
  end

  # Builds the textile modifier for an element from its attribute hash:
  # "(class#id)" when a class and/or id is present, followed by "{style}"
  # when a style attribute is present. Returns "" when none of them is.
  def build_styles_ids_and_classes(attributes)
    idclass = ''
    idclass += attributes['class'] if attributes.key?('class')
    idclass += "##{attributes['id']}" if attributes.key?('id')
    idclass = "(#{idclass})" unless idclass == ''
    style = attributes.key?('style') ? "{#{attributes['style']}}" : ''
    "#{idclass}#{style}"
  end

  # Opens a block element: writes "tag(class#id){style}. " and starts
  # capturing its content. attributes is the [name, value] pair list
  # supplied by the parser.
  def make_block_start_pair(tag, attributes)
    attributes = attrs_to_hash(attributes)
    class_style = build_styles_ids_and_classes(attributes)
    write("#{tag}#{class_style}. ")
    start_capture(tag)
  end

  # Closes a block element: flushes the capture and ends the paragraph
  # with a blank line.
  def make_block_end_pair
    stop_capture_and_write
    write("\n\n")
  end

  # Opens an inline "quicktag" (e.g. *bold*): writes a leading space, the
  # wrap character and any class/id/style modifier, then starts capturing.
  def make_quicktag_start_pair(tag, wrapchar, attributes)
    attributes = attrs_to_hash(attributes)
    class_style = build_styles_ids_and_classes(attributes)
    write([' ', "#{wrapchar}#{class_style}"])
    start_capture(tag)
  end

  # Closes an inline quicktag: flushes the capture, then writes the wrap
  # character and a trailing space.
  def make_quicktag_end_pair(wrapchar)
    stop_capture_and_write
    write([wrapchar, ' '])
  end

  # Appends d (a String, an Array of fragments, or nil) to the output.
  # While fewer than two captures are open the fragments go straight to
  # +result+; otherwise they are appended to the innermost capture buffer.
  def write(d)
    # Kernel#Array replaces String#to_a, which no longer exists on Ruby >= 1.9;
    # it wraps a String, passes an Array through and turns nil into [].
    fragments = Array(d)
    if data_stack.size < 2
      self.result += fragments
    else
      data_stack[-1] += fragments
    end
  end

  # Records +tag+ as the current block and pushes a fresh capture buffer.
  def start_capture(tag)
    self.in_block = tag
    data_stack.push([])
  end

  # Pops the innermost capture buffer and writes its contents one level up.
  def stop_capture_and_write
    self.in_block = false
    write(data_stack.pop)
  end

  # Text callback: writes the text with whitespace collapsed and the
  # leading/trailing whitespace stripped.
  def handle_data(data)
    return if data.nil? || data == ''

    write(normalise_space(data).strip)
  end

  # <h1>..<h6> map onto the textile block signatures h1. .. h6.
  %w[1 2 3 4 5 6].each do |num|
    define_method "start_h#{num}" do |attributes|
      make_block_start_pair("h#{num}", attributes)
    end

    define_method "end_h#{num}" do
      make_block_end_pair
    end
  end

  # HTML block tag => textile block signature.
  PAIRS = { 'blockquote' => 'bq', 'p' => 'p' }

  # HTML inline tag => textile wrap character(s).
  QUICKTAGS = { 'b' => '*', 'strong' => '*',
                'i' => '_', 'em' => '_', 'cite' => '??', 's' => '-',
                'sup' => '^', 'sub' => '~', 'code' => '@', 'span' => '%' }

  PAIRS.each do |key, value|
    define_method "start_#{key}" do |attributes|
      make_block_start_pair(value, attributes)
    end

    define_method "end_#{key}" do
      make_block_end_pair
    end
  end

  QUICKTAGS.each do |key, value|
    define_method "start_#{key}" do |attributes|
      make_quicktag_start_pair(key, value, attributes)
    end

    define_method "end_#{key}" do
      make_quicktag_end_pair(value)
    end
  end

  def start_ol(attrs)
    self.in_ol = true
  end

  def end_ol
    self.in_ol = false
    write("\n")
  end

  def start_ul(attrs)
    self.in_ul = true
  end

  def end_ul
    self.in_ul = false
    write("\n")
  end

  # List items are prefixed with "# " inside an ordered list and "* "
  # anywhere else.
  def start_li(attrs)
    write(in_ol ? '# ' : '* ')
    start_capture('li')
  end

  def end_li
    stop_capture_and_write
    write("\n")
  end

  # Links become "text":href. Anchors without an href produce no markup;
  # their text is written as ordinary data.
  def start_a(attrs)
    attrs = attrs_to_hash(attrs)
    self.a_href = attrs['href']
    return unless a_href

    write(' "')
    start_capture('a')
  end

  def end_a
    return unless a_href

    stop_capture_and_write
    write(['":', a_href, ' '])
    self.a_href = false
  end

  # Converts the parser's [[name, value], ...] attribute list into a Hash
  # keyed by the lower-cased attribute name (later duplicates win).
  def attrs_to_hash(array)
    array.each_with_object({}) do |(name, value), collection|
      collection[name.downcase] = value
    end
  end

  # Images become !src!.
  def start_img(attrs)
    attrs = attrs_to_hash(attrs)
    write([' !', attrs['src'], '! '])
  end

  def end_img
  end

  def start_tr(attrs)
  end

  # Each table row ends with a closing pipe and a newline.
  def end_tr
    write("|\n")
  end

  def start_td(attrs)
    write('|')
    start_capture('td')
  end

  def end_td
    stop_capture_and_write
    write('|')
  end

  def start_br(attrs)
    write("\n")
  end

  # Tags without a start_ handler are dropped unless they are listed in
  # @@permitted_tags, in which case they are re-emitted as HTML with only
  # the attributes listed in @@permitted_attrs.
  def unknown_starttag(tag, attrs)
    return unless @@permitted_tags.include?(tag)

    write(['<', tag])
    attrs.each do |key, value|
      write([' ', key, '="', value, '"']) if @@permitted_attrs.include?(key)
    end
    # Close the opening tag so the passed-through HTML is well formed.
    write('>')
  end

  def unknown_endtag(tag)
    write(['</', tag, '>']) if @@permitted_tags.include?(tag)
  end

  # Return the textile after processing
  def to_textile
    result.join
  end

  # UNCONVERTED PYTHON METHODS
  #
  # def handle_charref(self, tag):
  #   self._write(unichr(int(tag)))
  #
  # def handle_entityref(self, tag):
  #   if self.entitydefs.has_key(tag):
  #     self._write(self.entitydefs[tag])
  #
  # def handle_starttag(self, tag, method, attrs):
  #   method(dict(attrs))
  #
end
# A parser for SGML, using the derived class as static DTD.
#
# Subclasses describe the tags they understand by defining methods:
# +start_TAG(attrs)+ / +end_TAG+ for container elements (the tag is tracked
# on an open-element stack) or +do_TAG(attrs)+ for elements that are not
# tracked. +attrs+ is an Array of [name, value] pairs with lower-cased names.
# Everything else is reported through the handle_* / unknown_* callbacks,
# which do nothing (or, for references, decode a small default set) here.
class SGMLParser
  # Regular expressions used for parsing:
  Interesting = /[&<]/
  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                              '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                              '![^<>]*)?')

  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
  Charref = /&#([0-9]+)[^0-9]/

  Starttagopen = /<[>a-zA-Z]/
  Endtagopen = /<\/[<>a-zA-Z]/
  Endbracket = /[<>]/
  Special = /<![^<>]*>/
  Commentopen = /<!--/
  Commentclose = /--[ \t\n]*>/
  Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
  Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                            '(\s*=\s*' +
                            "('[^']*'" +
                            '|"[^"]*"' +
                            '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

  # Entity names decoded by the default #handle_entityref.
  Entitydefs =
    { 'lt' => '<', 'gt' => '>', 'amp' => '&', 'quot' => '"', 'apos' => '\'' }

  # verbose enables the diagnostic output of #report_unbalanced.
  def initialize(verbose = false)
    @verbose = verbose
    reset
  end

  # Discards any buffered input and returns the parser to its initial state.
  def reset
    @rawdata = ''
    @stack = []
    @lasttag = '???'
    @nomoretags = false
    @literal = false
  end

  # True if an element named +gi+ is currently open.
  def has_context(gi)
    @stack.include? gi
  end

  # From now on treat all remaining input as plain data.
  def setnomoretags
    @nomoretags = true
    @literal = true
  end

  # Treat start tags, comments and declarations as plain data until the
  # next end tag is parsed.
  def setliteral(*args)
    @literal = true
  end

  # Appends +data+ to the input buffer and parses as much of it as is
  # complete; an unfinished trailing construct stays buffered.
  def feed(data)
    @rawdata << data
    goahead(false)
  end

  # Signals end of input: whatever is still buffered is emitted as data.
  def close
    goahead(true)
  end

  # Main scanning loop. Walks the buffer, dispatching text, tags, comments,
  # declarations and references to the parse_* / handle_* methods. When
  # +at_end+ is true, input that could not be parsed is flushed as data
  # instead of being kept for the next #feed.
  def goahead(at_end)
    rawdata = @rawdata
    i = 0
    n = rawdata.length
    while i < n
      if @nomoretags
        handle_data(rawdata[i..(n - 1)])
        i = n
        break
      end
      # Emit the plain text up to the next '&' or '<'.
      j = rawdata.index(Interesting, i) || n
      handle_data(rawdata[i..(j - 1)]) if i < j
      i = j
      break if i == n

      if rawdata[i] == '<'
        if rawdata.index(Starttagopen, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_starttag(i)
          break unless k # tag not complete yet; wait for more input
          i = k
          next
        end
        if rawdata.index(Endtagopen, i) == i
          k = parse_endtag(i)
          break unless k
          i = k
          @literal = false
          next
        end
        if rawdata.index(Commentopen, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_comment(i)
          break unless k
          i += k # parse_comment returns a length, not a position
          next
        end
        if rawdata.index(Special, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_special(i)
          break unless k
          i += k # parse_special returns a length, not a position
          next
        end
      elsif rawdata[i] == '&'
        if rawdata.index(Charref, i) == i
          found = Regexp.last_match
          i += found[0].length
          handle_charref(found[1])
          # The pattern consumes one terminating character; give it back
          # unless it was the ';' that belongs to the reference.
          i -= 1 unless rawdata[i - 1] == ';'
          next
        end
        if rawdata.index(Entityref, i) == i
          found = Regexp.last_match
          i += found[0].length
          handle_entityref(found[1])
          i -= 1 unless rawdata[i - 1] == ';'
          next
        end
      else
        raise RuntimeError, 'neither < nor & ??'
      end
      # We get here only if incomplete matches but
      # nothing else
      match = rawdata.index(Incomplete, i)
      unless match == i
        handle_data(rawdata[i, 1])
        i += 1
        next
      end
      j = match + Regexp.last_match[0].length
      break if j == n # Really incomplete

      handle_data(rawdata[i..(j - 1)])
      i = j
    end
    # end while
    if at_end && i < n
      handle_data(@rawdata[i..(n - 1)])
      i = n
    end
    # Keep only the unparsed remainder for the next call.
    @rawdata = rawdata[i..-1]
  end

  # Parses the comment starting at index +i+ and passes its text to
  # #handle_comment. Returns the number of characters consumed, or nil when
  # the comment is not terminated yet.
  # Raises RuntimeError if +i+ does not point at "<!--".
  def parse_comment(i)
    rawdata = @rawdata
    if rawdata[i, 4] != '<!--'
      raise RuntimeError, 'unexpected call to handle_comment'
    end

    match = rawdata.index(Commentclose, i)
    return nil unless match

    matched_length = Regexp.last_match[0].length
    handle_comment(rawdata[i + 4..(match - 1)])
    match + matched_length - i
  end

  # Parses the start tag at index +i+, collects its attributes and
  # dispatches through #finish_starttag. Returns the index just past the
  # tag, or nil when the closing bracket has not arrived yet.
  def parse_starttag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j

    attrs = []
    if rawdata[i + 1] == '>'
      # SGML shorthand: <> == <last open tag seen>
      k = j
      tag = @lasttag
    else
      match = rawdata.index(Tagfind, i + 1)
      unless match
        raise RuntimeError, 'unexpected call to parse_starttag'
      end

      name = Regexp.last_match[0]
      k = i + 1 + name.length
      tag = name.downcase
      @lasttag = tag
    end
    while k < j
      found = Attrfind.match(rawdata, k)
      # An attribute must start exactly at k. A match further along belongs
      # to text after this tag and must not be taken as an attribute.
      break unless found && found.begin(0) == k

      attrname, rest, attrvalue = found[1], found[2], found[3]
      if !rest
        attrvalue = '' # was: = attrname
      elsif (attrvalue[0] == "'" && attrvalue[-1] == "'") ||
            (attrvalue[0] == '"' && attrvalue[-1] == '"')
        attrvalue = attrvalue[1..-2]
      end
      attrs << [attrname.downcase, attrvalue]
      k += found[0].length
    end
    j += 1 if rawdata[j] == '>'
    finish_starttag(tag, attrs)
    j
  end

  # Parses the end tag at index +i+ and dispatches through #finish_endtag.
  # Returns the index just past the tag, or nil when it is incomplete.
  def parse_endtag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j

    tag = rawdata[i + 2..j - 1].strip.downcase
    j += 1 if rawdata[j] == '>'
    finish_endtag(tag)
    j
  end

  # Dispatches a start tag. Returns 1 when a start_TAG handler exists (the
  # tag is pushed on the stack), 0 when only do_TAG exists, and -1 when the
  # tag went to #unknown_starttag.
  def finish_starttag(tag, attrs)
    method = 'start_' + tag
    if respond_to?(method)
      @stack << tag
      handle_starttag(tag, method, attrs)
      return 1
    end

    method = 'do_' + tag
    if respond_to?(method)
      handle_starttag(tag, method, attrs)
      0
    else
      unknown_starttag(tag, attrs)
      -1
    end
  end

  # Dispatches an end tag. An empty tag ("</>") closes the most recently
  # opened element. A named tag closes its innermost open occurrence together
  # with every element opened inside it.
  def finish_endtag(tag)
    if tag == ''
      found = @stack.length - 1
      if found < 0
        unknown_endtag(tag)
        return
      end
    else
      unless @stack.include?(tag)
        if respond_to?('end_' + tag)
          # Known tag, but nothing of that name is open.
          report_unbalanced(tag)
        else
          unknown_endtag(tag)
        end
        return
      end
      # rindex, not index: the end tag matches the innermost open element,
      # so nested elements of the same name close one at a time.
      found = @stack.rindex(tag)
    end
    while @stack.length > found
      open_tag = @stack[-1]
      method = 'end_' + open_tag
      if respond_to?(method)
        handle_endtag(open_tag, method)
      else
        unknown_endtag(open_tag)
      end
      @stack.pop
    end
  end

  # Parses the "<!...>" declaration at index +i+ and passes its content
  # (without the angle brackets) to #handle_special. Returns the number of
  # characters consumed, or nil when it is incomplete.
  def parse_special(i)
    rawdata = @rawdata
    match = rawdata.index(Endbracket, i + 1)
    return nil unless match

    matched_length = Regexp.last_match[0].length
    handle_special(rawdata[i + 1..(match - 1)])
    match - i + matched_length
  end

  # Invokes the handler named +method+ with the attribute list.
  def handle_starttag(tag, method, attrs)
    send(method, attrs)
  end

  # Invokes the handler named +method+.
  def handle_endtag(tag, method)
    send(method)
  end

  # In verbose mode, prints a diagnostic for an end tag that has a handler
  # but no matching open element.
  def report_unbalanced(tag)
    return unless @verbose

    print '*** Unbalanced </' + tag + '>', "\n"
    print '*** Stack:', @stack.inspect, "\n"
  end

  # Handles a numeric character reference such as "&#65;". +name+ is the
  # digit string. Values 0..255 are emitted as a one-character string via
  # #handle_data; anything else goes to #unknown_charref.
  def handle_charref(name)
    # Base 10 is forced so that leading zeros are not read as octal.
    n = Integer(name, 10)
    unless n.between?(0, 255)
      unknown_charref(name)
      return
    end
    handle_data(n.chr)
  end

  # Handles a named entity reference such as "&amp;". Names listed in
  # Entitydefs are emitted via #handle_data; others go to
  # #unknown_entityref.
  def handle_entityref(name)
    if Entitydefs.include?(name)
      handle_data(Entitydefs[name])
    else
      unknown_entityref(name)
    end
  end

  # Callbacks for subclasses to override; they do nothing by default.

  def handle_data(data)
  end

  def handle_comment(data)
  end

  def handle_special(data)
  end

  def unknown_starttag(tag, attrs)
  end

  def unknown_endtag(tag)
  end

  def unknown_charref(ref)
  end

  def unknown_entityref(ref)
  end
end
It's been a long time since I've thought about this code at all! But cutting a gem of it does seem like a good idea. I'll probably get onto that at the weekend or next week and post here again when it's done.
Thanks, I was going to whip it up, but I thought I'd ask if you planned to do... I've found a couple of issues with code blocks (pre/code) and also named anchors.
I'll have a look at fixing them if I get a chance, I guess it would be a good idea to turn this into a proper github repo.
added HTML 2 markdown parser .. http://gist.github.com/441545#file_html2markdown.rb
I've created a proper repository out of it at http://github.com/jystewart/html2textile
It's a very quick conversion and could almost certainly do with work, but it's there. If you could fork that and add your markdown one then I can begin the proper packaging. Probably ought to write some tests/specs one of these days too...
Hi James, have you thought about cutting a gem of this? It works great for me by the way, thanks for your efforts.