jystewart · March 20, 2009 19:41 · jasonm23 · Jun 17, 2010 · jystewart · Jun 17, 2010
diff --git a/example.rb b/example.rb
 # Example script: convert a fragment of blog HTML to Textile and print it.
 #
 # The converter is loaded relative to this file, because a bare
 # require 'html2textile' only resolves when "." is on the load path.
 require File.expand_path('../html2textile', __FILE__)

 # Sample input. "<<-" lets the END terminator be indented.
 first_block = <<-END
 <div class="column span-3">
  <h3 class="storytitle entry-title" id="post-312">
    <a href="http://jystewart.net/process/2007/11/converting-html-to-textile-with-ruby/" rel="bookmark">Converting HTML to Textile with Ruby</a>
  </h3>

  <p>
    <span>23 November 2007</span> 
    (<abbr class="updated" title="2007-11-23T19:51:54+00:00">7:51 pm</abbr>)
  </p>
 		
  <p>
    By <span class="author vcard fn">James Stewart</span>
    <br />filed under: 
      <a href="http://jystewart.net/process/category/snippets/" title="View all posts in Snippets" rel="category tag">Snippets</a>
      <br />tagged: <a href="http://jystewart.net/process/tag/content-management/" rel="tag">content management</a>,
      <a href="http://jystewart.net/process/tag/conversion/" rel="tag">conversion</a>,
      <a href="http://jystewart.net/process/tag/html/" rel="tag">html</a>,
      <a href="http://jystewart.net/process/tag/python/" rel="tag">Python</a>,
      <a href="http://jystewart.net/process/tag/ruby/" rel="tag">ruby</a>,
      <a href="http://jystewart.net/process/tag/textile/" rel="tag">textile</a>
  </p>

 		
  <div class="feedback">
    <script src="http://feeds.feedburner.com/~s/jystewart/iLiN?i=http://jystewart.net/process/2007/11/converting-html-to-textile-with-ruby/" type="text/javascript" charset="utf-8"></script>
  </div>
 </div>
 END

 parser = HTMLToTextileParser.new
 parser.feed(first_block)
 # feed keeps incomplete trailing markup buffered; close flushes it as data.
 parser.close
 puts parser.to_textile
diff --git a/html2textile.rb b/html2textile.rb
 require 'sgml-parser'

 # A class to convert HTML to textile. Based on the python parser
 # found at http://aftnn.org/content/code/html2textile/
 #
 # Read more at http://jystewart.net/process/2007/11/converting-html-to-textile-with-ruby
 #
 # Author::    James Stewart  (mailto:[email protected])
 # Copyright:: Copyright (c) 2007 James Stewart
 # License::   Distributed under the same terms as Ruby

 # This class is an implementation of an SGMLParser designed to convert
 # HTML to textile.
 # 
 # Example usage:
 #   parser = HTMLToTextileParser.new
 #   parser.feed(input_html)
 #   puts parser.to_textile
 class HTMLToTextileParser < SGMLParser

   # Output fragments collected so far; #to_textile joins them.
   attr_accessor :result
   # Name of the element whose content was most recently captured, or false.
   attr_accessor :in_block
   # Stack of capture buffers, one Array per element opened by start_capture.
   attr_accessor :data_stack
   # href of the <a> element being processed (nil/false when there is none).
   attr_accessor :a_href
   # Set to true by start_ul/start_ol and back to false by end_ul/end_ol.
   # in_ol makes start_li emit "#" instead of "*".
   attr_accessor :in_ul
   attr_accessor :in_ol

   # Tags without a handler are echoed as raw HTML only when listed in
   # @@permitted_tags; of their attributes only those in @@permitted_attrs
   # are kept. Both lists are empty, so unknown markup is dropped.
   @@permitted_tags = []
   @@permitted_attrs = []

   # HTML block element => Textile block signature.
   PAIRS = { 'blockquote' => 'bq', 'p' => 'p' }

   # HTML inline element => Textile wrapping character(s).
   QUICKTAGS = { 'b' => '*', 'strong' => '*',
     'i' => '_', 'em' => '_', 'cite' => '??', 's' => '-',
     'sup' => '^', 'sub' => '~', 'code' => '@', 'span' => '%'}

   # verbose is passed straight through to SGMLParser#initialize.
   def initialize(verbose=nil)
     @output = String.new
     self.in_block = false
     self.result = []
     self.data_stack = []
     super(verbose)
   end

   # Normalise space in the same manner as HTML. Any substring of multiple
   # whitespace characters will be replaced with a single space char.
   def normalise_space(s)
     s.to_s.gsub(/\s+/, ' ')
   end

   # Builds the Textile attribute suffix "(class#id){style}" from a Hash of
   # HTML attributes. Returns "" when none of class, id or style is present.
   def build_styles_ids_and_classes(attributes)
     idclass = ''
     idclass += attributes['class'] if attributes.has_key?('class')
     idclass += "\##{attributes['id']}" if attributes.has_key?('id')
     idclass = "(#{idclass})" unless idclass == ''

     style = attributes.has_key?('style') ? "{#{attributes['style']}}" : ""
     "#{idclass}#{style}"
   end

   # Writes a block signature such as "h3(class#id). " and starts capturing
   # the content of the block. attributes is the Array of [name, value] pairs
   # handed over by the SGML parser.
   def make_block_start_pair(tag, attributes)
     class_style = build_styles_ids_and_classes(attrs_to_hash(attributes))
     write("#{tag}#{class_style}. ")
     start_capture(tag)
   end

   # Flushes the captured block content and ends the block with a blank line.
   def make_block_end_pair
     stop_capture_and_write
     write("\n\n")
   end

   # Opens an inline quicktag: a space, the wrap character(s) and any
   # class/id/style suffix, then starts capturing the wrapped content.
   def make_quicktag_start_pair(tag, wrapchar, attributes)
     class_style = build_styles_ids_and_classes(attrs_to_hash(attributes))
     write([" ", "#{wrapchar}#{class_style}"])
     start_capture(tag)
   end

   # Flushes the captured content and closes the quicktag.
   def make_quicktag_end_pair(wrapchar)
     stop_capture_and_write
     write([wrapchar, " "])
   end

   # Appends d (a String, an Array of fragments, or nil) to the output.
   # With fewer than two open capture buffers the fragments go straight to
   # result, otherwise to the innermost buffer.
   def write(d)
     # Kernel#Array is used because String#to_a no longer exists in Ruby 1.9+.
     fragments = Array(d)
     if data_stack.size < 2
       self.result += fragments
     else
       data_stack[-1] += fragments
     end
   end

   # Opens a new capture buffer for the element named tag.
   def start_capture(tag)
     self.in_block = tag
     data_stack.push([])
   end

   # Closes the innermost capture buffer and writes out what it collected.
   def stop_capture_and_write
     self.in_block = false
     write(data_stack.pop)
   end

   # SGMLParser callback for character data: collapses whitespace runs and
   # strips leading/trailing whitespace before writing.
   def handle_data(data)
     return if data.nil? || data == ''
     write(normalise_space(data).strip)
   end

   # start_h1..start_h6 / end_h1..end_h6
   (1..6).each do |level|
     define_method "start_h#{level}" do |attributes|
       make_block_start_pair("h#{level}", attributes)
     end

     define_method "end_h#{level}" do
       make_block_end_pair
     end
   end

   # start_/end_ handlers for every block element in PAIRS.
   PAIRS.each do |html_tag, signature|
     define_method "start_#{html_tag}" do |attributes|
       make_block_start_pair(signature, attributes)
     end

     define_method "end_#{html_tag}" do
       make_block_end_pair
     end
   end

   # start_/end_ handlers for every inline element in QUICKTAGS.
   QUICKTAGS.each do |html_tag, wrapchar|
     define_method "start_#{html_tag}" do |attributes|
       make_quicktag_start_pair(html_tag, wrapchar, attributes)
     end

     define_method "end_#{html_tag}" do
       make_quicktag_end_pair(wrapchar)
     end
   end

   def start_ol(attrs)
     self.in_ol = true
   end

   def end_ol
     self.in_ol = false
     write("\n")
   end

   def start_ul(attrs)
     self.in_ul = true
   end

   def end_ul
     self.in_ul = false
     write("\n")
   end

   # List items are "# " inside an ordered list and "* " everywhere else.
   def start_li(attrs)
     write(in_ol ? "# " : "* ")
     start_capture("li")
   end

   def end_li
     stop_capture_and_write
     write("\n")
   end

   # Links become "text":href. An <a> without href produces no link markup;
   # its text is written like any other character data.
   def start_a(attrs)
     self.a_href = attrs_to_hash(attrs)['href']
     return unless a_href

     write(" \"")
     start_capture("a")
   end

   def end_a
     return unless a_href

     stop_capture_and_write
     write(["\":", a_href, " "])
     self.a_href = false
   end

   # Turns the parser's Array of [name, value] pairs into a Hash keyed by the
   # lower-cased attribute name. A later duplicate overwrites an earlier one.
   def attrs_to_hash(array)
     array.inject({}) do |collection, part|
       collection[part[0].downcase] = part[1]
       collection
     end
   end

   # Images become " !src! ".
   def start_img(attrs)
     write([" !", attrs_to_hash(attrs)["src"], "! "])
   end

   def end_img
   end

   def start_tr(attrs)
   end

   def end_tr
     write("|\n")
   end

   def start_td(attrs)
     write("|")
     start_capture("td")
   end

   def end_td
     stop_capture_and_write
     write("|")
   end

   def start_br(attrs)
     write("\n")
   end

   # SGMLParser callback for tags without a start_/do_ handler. Permitted
   # tags are echoed as raw HTML with their permitted attributes.
   def unknown_starttag(tag, attrs)
     return unless @@permitted_tags.include?(tag)

     write(["<", tag])
     attrs.each do |key, value|
       write([" ", key, "=\"", value, "\""]) if @@permitted_attrs.include?(key)
     end
     # Close the echoed start tag so the output is well formed.
     write(">")
   end

   # SGMLParser callback for end tags without a handler.
   def unknown_endtag(tag)
     write(["</", tag, ">"]) if @@permitted_tags.include?(tag)
   end

   # Return the textile after processing
   def to_textile
     result.join
   end

   # UNCONVERTED PYTHON METHODS
   #
   # def handle_charref(self, tag):
   #     self._write(unichr(int(tag)))
   #
   # def handle_entityref(self, tag):
   #     if self.entitydefs.has_key(tag):
   #         self._write(self.entitydefs[tag])
   #
   # def handle_starttag(self, tag, method, attrs):
   #     method(dict(attrs))
   #

 end
diff --git a/sgml-parser.rb b/sgml-parser.rb
 # A parser for SGML, using the derived class as static DTD.

 class SGMLParser

   # Callback-driven SGML/HTML tokenizer. A subclass describes the document
   # type by defining handler methods:
   #   start_TAG(attrs) / end_TAG  - element tracked on the open-element stack
   #   do_TAG(attrs)               - element handled without stack tracking
   #   handle_data, handle_comment, handle_special, unknown_*  - the rest
   # attrs is an Array of [lower-cased name, value] pairs.
   # Feed input with #feed and finish with #close.

   # Regular expressions used for parsing:
   Interesting = /[&<]/
   Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                               '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                               '![^<>]*)?')

   Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
   Charref = /&#([0-9]+)[^0-9]/

   Starttagopen = /<[>a-zA-Z]/
   Endtagopen = /<\/[<>a-zA-Z]/
   Endbracket = /[<>]/
   Special = /<![^<>]*>/
   Commentopen = /<!--/
   Commentclose = /--[ \t\n]*>/
   Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
   Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                             '(\s*=\s*' +
                             "('[^']*'" +
                             '|"[^"]*"' +
                             '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

   # Entity names resolved by handle_entityref.
   Entitydefs =
     {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

   # verbose enables the diagnostics printed by report_unbalanced.
   def initialize(verbose=false)
     @verbose = verbose
     reset
   end

   # Discards buffered input and the open-element stack.
   def reset
     @rawdata = ''
     @stack = []
     @lasttag = '???'
     @nomoretags = false
     @literal = false
   end

   # True when an element named gi is currently open.
   def has_context(gi)
     @stack.include? gi
   end

   # From now on all remaining input is passed to handle_data untouched.
   def setnomoretags
     @nomoretags = true
     @literal = true
   end

   # Literal mode: start tags, comments and declarations are passed on as
   # character data until the next end tag is parsed.
   def setliteral(*args)
     @literal = true
   end

   # Appends data to the input buffer and parses as much as is complete.
   def feed(data)
     @rawdata << data
     goahead(false)
   end

   # Parses the rest of the buffer, flushing incomplete markup as data.
   def close
     goahead(true)
   end

   # Main scanning loop. Consumes @rawdata from the front and dispatches to
   # the handlers. When _end is false, markup that may still be completed by
   # a later #feed is left in the buffer.
   def goahead(_end)
     rawdata = @rawdata
     i = 0
     n = rawdata.length
     while i < n
       if @nomoretags
         handle_data(rawdata[i..(n - 1)])
         i = n
         break
       end
       # Everything up to the next "<" or "&" is plain character data.
       j = rawdata.index(Interesting, i) || n
       handle_data(rawdata[i..(j - 1)]) if i < j
       i = j
       break if i == n
       if rawdata[i, 1] == '<'
         if rawdata.index(Starttagopen, i) == i
           if @literal
             handle_data(rawdata[i, 1])
             i += 1
             next
           end
           k = parse_starttag(i)
           break unless k
           i = k
           next
         end
         if rawdata.index(Endtagopen, i) == i
           k = parse_endtag(i)
           break unless k
           i = k
           @literal = false
           next
         end
         if rawdata.index(Commentopen, i) == i
           if @literal
             handle_data(rawdata[i, 1])
             i += 1
             next
           end
           k = parse_comment(i) # length of the comment, not an offset
           break unless k
           i += k
           next
         end
         if rawdata.index(Special, i) == i
           if @literal
             handle_data(rawdata[i, 1])
             i += 1
             next
           end
           k = parse_special(i) # length of the declaration
           break unless k
           i += k
           next
         end
       elsif rawdata[i, 1] == '&'
         if rawdata.index(Charref, i) == i
           ref = Regexp.last_match
           i += ref[0].length
           handle_charref(ref[1])
           # The pattern also consumed the terminator; give it back unless
           # it was the ";" that belongs to the reference.
           i -= 1 unless rawdata[i - 1, 1] == ';'
           next
         end
         if rawdata.index(Entityref, i) == i
           ref = Regexp.last_match
           i += ref[0].length
           handle_entityref(ref[1])
           i -= 1 unless rawdata[i - 1, 1] == ';'
           next
         end
       else
         raise RuntimeError, 'neither < nor & ??'
       end
       # We get here only if incomplete matches but
       # nothing else
       match = rawdata.index(Incomplete, i)
       unless match == i
         handle_data(rawdata[i, 1])
         i += 1
         next
       end
       j = match + Regexp.last_match[0].length
       break if j == n # Really incomplete
       handle_data(rawdata[i..(j - 1)])
       i = j
     end
     # end while
     if _end and i < n
       handle_data(@rawdata[i..(n - 1)])
       i = n
     end
     @rawdata = rawdata[i..-1]
   end

   # Parses the comment starting at offset i and reports its text through
   # handle_comment. Returns the comment's length, or nil when the closing
   # "-->" has not arrived yet.
   def parse_comment(i)
     rawdata = @rawdata
     if rawdata[i, 4] != '<!--'
       raise RuntimeError, 'unexpected call to handle_comment'
     end
     close_at = rawdata.index(Commentclose, i)
     return nil unless close_at
     close_length = Regexp.last_match[0].length
     handle_comment(rawdata[i + 4..(close_at - 1)])
     close_at + close_length - i
   end

   # Parses the start tag at offset i, collects its attributes and
   # dispatches through finish_starttag. Returns the offset just past the
   # tag, or nil when the tag is not complete yet.
   def parse_starttag(i)
     rawdata = @rawdata
     j = rawdata.index(Endbracket, i + 1)
     return nil unless j
     attrs = []
     if rawdata[i + 1, 1] == '>'
       # SGML shorthand: <> == <last open tag seen>
       k = j
       tag = @lasttag
     else
       unless rawdata.index(Tagfind, i + 1)
         raise RuntimeError, 'unexpected call to parse_starttag'
       end
       name = Regexp.last_match[0]
       k = i + 1 + name.length
       tag = name.downcase
       @lasttag = tag
     end
     while k < j
       # Only accept attributes that start inside this tag. An unbounded
       # search would pick up words from the text after the tag, so that
       # "<br />filed" gave <br> a "filed" attribute.
       start = rawdata.index(Attrfind, k)
       break unless start && start < j
       found = Regexp.last_match
       attrname, rest, attrvalue = found[1], found[2], found[3]
       if not rest
         attrvalue = '' # was: = attrname
       elsif (attrvalue[0, 1] == "'" && attrvalue[-1, 1] == "'") or
           (attrvalue[0, 1] == '"' && attrvalue[-1, 1] == '"')
         attrvalue = attrvalue[1..-2]
       end
       attrs << [attrname.downcase, attrvalue]
       # Continue after the match itself, including any skipped characters.
       k = start + found[0].length
     end
     j += 1 if rawdata[j, 1] == '>'
     finish_starttag(tag, attrs)
     j
   end

   # Parses the end tag at offset i and dispatches through finish_endtag.
   # Returns the offset just past the tag, or nil when it is incomplete.
   def parse_endtag(i)
     rawdata = @rawdata
     j = rawdata.index(Endbracket, i + 1)
     return nil unless j
     tag = rawdata[i + 2..j - 1].strip.downcase
     j += 1 if rawdata[j, 1] == '>'
     finish_endtag(tag)
     j
   end

   # Dispatches a start tag. Returns 1 when start_TAG handled it (the tag is
   # pushed on the stack), 0 when do_TAG handled it, -1 when it went to
   # unknown_starttag.
   def finish_starttag(tag, attrs)
     method = 'start_' + tag
     if respond_to?(method)
       @stack << tag
       handle_starttag(tag, method, attrs)
       return 1
     end
     method = 'do_' + tag
     if respond_to?(method)
       handle_starttag(tag, method, attrs)
       return 0
     end
     unknown_starttag(tag, attrs)
     -1
   end

   # Dispatches an end tag. Closes every element opened after the matching
   # start tag, then the element itself. An empty tag name ("</>") closes
   # the innermost open element.
   def finish_endtag(tag)
     if tag == ''
       found = @stack.length - 1
       if found < 0
         unknown_endtag(tag)
         return
       end
     else
       unless @stack.include? tag
         unknown_endtag(tag) unless respond_to?('end_' + tag)
         return
       end
       # rindex picks the innermost open element of that name. index would
       # pick the outermost and close the enclosing same-named element too.
       found = @stack.rindex(tag)
     end
     while @stack.length > found
       tag = @stack[-1]
       method = 'end_' + tag
       if respond_to?(method)
         handle_endtag(tag, method)
       else
         unknown_endtag(tag)
       end
       @stack.pop
     end
   end

   # Parses the "<!...>" declaration at offset i and reports its content
   # through handle_special. Returns its length, or nil when incomplete.
   def parse_special(i)
     rawdata = @rawdata
     close_at = rawdata.index(Endbracket, i + 1)
     return nil unless close_at
     bracket_length = Regexp.last_match[0].length
     handle_special(rawdata[i + 1..(close_at - 1)])
     close_at - i + bracket_length
   end

   # Calls the start_/do_ handler; override to change how attrs are passed.
   def handle_starttag(tag, method, attrs)
     self.send(method, attrs)
   end

   # Calls the end_ handler.
   def handle_endtag(tag, method)
     self.send(method)
   end

   # Prints a diagnostic for an unbalanced end tag when verbose is set.
   def report_unbalanced(tag)
     if @verbose
       print '*** Unbalanced </' + tag + '>', "\n"
       print '*** Stack:', @stack.inspect, "\n"
     end
   end

   # Handles "&#NNN;". name is the decimal digit string. Code points in
   # 0..255 are passed to handle_data as a single character; anything else
   # goes to unknown_charref.
   def handle_charref(name)
     # Always read the digits as base 10: Integer("065") would be octal 53
     # and Integer("09") raises ArgumentError.
     n = name.to_s =~ /\A[0-9]+\z/ ? name.to_s.to_i : -1
     if !(0 <= n && n <= 255)
       unknown_charref(name)
       return
     end
     handle_data(n.chr)
   end

   # Handles "&name;" by looking name up in Entitydefs.
   def handle_entityref(name)
     table = Entitydefs
     if table.include?(name)
       handle_data(table[name])
     else
       unknown_entityref(name)
       return
     end
   end

   # Default handlers: all no-ops, meant to be overridden by subclasses.
   def handle_data(data)
   end

   def handle_comment(data)
   end

   def handle_special(data)
   end

   def unknown_starttag(tag, attrs)
   end
   def unknown_endtag(tag)
   end
   def unknown_charref(ref)
   end
   def unknown_entityref(ref)
   end

 end
	# Example script: convert a fragment of blog HTML to Textile and print it.
	#
	# The converter is loaded relative to this file, because a bare
	# require 'html2textile' only resolves when "." is on the load path.
	require File.expand_path('../html2textile', __FILE__)

	# Sample input. "<<-" lets the END terminator be indented.
	first_block = <<-END
	<div class="column span-3">
	<h3 class="storytitle entry-title" id="post-312">
	<a href="http://jystewart.net/process/2007/11/converting-html-to-textile-with-ruby/" rel="bookmark">Converting HTML to Textile with Ruby</a>
	</h3>

	<p>
	<span>23 November 2007</span>
	(<abbr class="updated" title="2007-11-23T19:51:54+00:00">7:51 pm</abbr>)
	</p>

	<p>
	By <span class="author vcard fn">James Stewart</span>
	<br />filed under:
	<a href="http://jystewart.net/process/category/snippets/" title="View all posts in Snippets" rel="category tag">Snippets</a>
	<br />tagged: <a href="http://jystewart.net/process/tag/content-management/" rel="tag">content management</a>,
	<a href="http://jystewart.net/process/tag/conversion/" rel="tag">conversion</a>,
	<a href="http://jystewart.net/process/tag/html/" rel="tag">html</a>,
	<a href="http://jystewart.net/process/tag/python/" rel="tag">Python</a>,
	<a href="http://jystewart.net/process/tag/ruby/" rel="tag">ruby</a>,
	<a href="http://jystewart.net/process/tag/textile/" rel="tag">textile</a>
	</p>


	<div class="feedback">
	<script src="http://feeds.feedburner.com/~s/jystewart/iLiN?i=http://jystewart.net/process/2007/11/converting-html-to-textile-with-ruby/" type="text/javascript" charset="utf-8"></script>
	</div>
	</div>
	END

	parser = HTMLToTextileParser.new
	parser.feed(first_block)
	# feed keeps incomplete trailing markup buffered; close flushes it as data.
	parser.close
	puts parser.to_textile
	require 'sgml-parser'

	# A class to convert HTML to textile. Based on the python parser
	# found at http://aftnn.org/content/code/html2textile/
	#
	# Read more at http://jystewart.net/process/2007/11/converting-html-to-textile-with-ruby
	#
	# Author:: James Stewart (mailto:[email protected])
	# Copyright:: Copyright (c) 2007 James Stewart
	# License:: Distributed under the same terms as Ruby

	# This class is an implementation of an SGMLParser designed to convert
	# HTML to textile.
	#
	# Example usage:
	# parser = HTMLToTextileParser.new
	# parser.feed(input_html)
	# puts parser.to_textile
	class HTMLToTextileParser < SGMLParser

	  # Output fragments collected so far; #to_textile joins them.
	  attr_accessor :result
	  # Name of the element whose content was most recently captured, or false.
	  attr_accessor :in_block
	  # Stack of capture buffers, one Array per element opened by start_capture.
	  attr_accessor :data_stack
	  # href of the <a> element being processed (nil/false when there is none).
	  attr_accessor :a_href
	  # Set to true by start_ul/start_ol and back to false by end_ul/end_ol.
	  # in_ol makes start_li emit "#" instead of "*".
	  attr_accessor :in_ul
	  attr_accessor :in_ol

	  # Tags without a handler are echoed as raw HTML only when listed in
	  # @@permitted_tags; of their attributes only those in @@permitted_attrs
	  # are kept. Both lists are empty, so unknown markup is dropped.
	  @@permitted_tags = []
	  @@permitted_attrs = []

	  # HTML block element => Textile block signature.
	  PAIRS = { 'blockquote' => 'bq', 'p' => 'p' }

	  # HTML inline element => Textile wrapping character(s).
	  QUICKTAGS = { 'b' => '*', 'strong' => '*',
	    'i' => '_', 'em' => '_', 'cite' => '??', 's' => '-',
	    'sup' => '^', 'sub' => '~', 'code' => '@', 'span' => '%'}

	  # verbose is passed straight through to SGMLParser#initialize.
	  def initialize(verbose=nil)
	    @output = String.new
	    self.in_block = false
	    self.result = []
	    self.data_stack = []
	    super(verbose)
	  end

	  # Normalise space in the same manner as HTML. Any substring of multiple
	  # whitespace characters will be replaced with a single space char.
	  def normalise_space(s)
	    s.to_s.gsub(/\s+/, ' ')
	  end

	  # Builds the Textile attribute suffix "(class#id){style}" from a Hash of
	  # HTML attributes. Returns "" when none of class, id or style is present.
	  def build_styles_ids_and_classes(attributes)
	    idclass = ''
	    idclass += attributes['class'] if attributes.has_key?('class')
	    idclass += "\##{attributes['id']}" if attributes.has_key?('id')
	    idclass = "(#{idclass})" unless idclass == ''

	    style = attributes.has_key?('style') ? "{#{attributes['style']}}" : ""
	    "#{idclass}#{style}"
	  end

	  # Writes a block signature such as "h3(class#id). " and starts capturing
	  # the content of the block. attributes is the Array of [name, value] pairs
	  # handed over by the SGML parser.
	  def make_block_start_pair(tag, attributes)
	    class_style = build_styles_ids_and_classes(attrs_to_hash(attributes))
	    write("#{tag}#{class_style}. ")
	    start_capture(tag)
	  end

	  # Flushes the captured block content and ends the block with a blank line.
	  def make_block_end_pair
	    stop_capture_and_write
	    write("\n\n")
	  end

	  # Opens an inline quicktag: a space, the wrap character(s) and any
	  # class/id/style suffix, then starts capturing the wrapped content.
	  def make_quicktag_start_pair(tag, wrapchar, attributes)
	    class_style = build_styles_ids_and_classes(attrs_to_hash(attributes))
	    write([" ", "#{wrapchar}#{class_style}"])
	    start_capture(tag)
	  end

	  # Flushes the captured content and closes the quicktag.
	  def make_quicktag_end_pair(wrapchar)
	    stop_capture_and_write
	    write([wrapchar, " "])
	  end

	  # Appends d (a String, an Array of fragments, or nil) to the output.
	  # With fewer than two open capture buffers the fragments go straight to
	  # result, otherwise to the innermost buffer.
	  def write(d)
	    # Kernel#Array is used because String#to_a no longer exists in Ruby 1.9+.
	    fragments = Array(d)
	    if data_stack.size < 2
	      self.result += fragments
	    else
	      data_stack[-1] += fragments
	    end
	  end

	  # Opens a new capture buffer for the element named tag.
	  def start_capture(tag)
	    self.in_block = tag
	    data_stack.push([])
	  end

	  # Closes the innermost capture buffer and writes out what it collected.
	  def stop_capture_and_write
	    self.in_block = false
	    write(data_stack.pop)
	  end

	  # SGMLParser callback for character data: collapses whitespace runs and
	  # strips leading/trailing whitespace before writing.
	  def handle_data(data)
	    return if data.nil? || data == ''
	    write(normalise_space(data).strip)
	  end

	  # start_h1..start_h6 / end_h1..end_h6
	  (1..6).each do |level|
	    define_method "start_h#{level}" do |attributes|
	      make_block_start_pair("h#{level}", attributes)
	    end

	    define_method "end_h#{level}" do
	      make_block_end_pair
	    end
	  end

	  # start_/end_ handlers for every block element in PAIRS.
	  PAIRS.each do |html_tag, signature|
	    define_method "start_#{html_tag}" do |attributes|
	      make_block_start_pair(signature, attributes)
	    end

	    define_method "end_#{html_tag}" do
	      make_block_end_pair
	    end
	  end

	  # start_/end_ handlers for every inline element in QUICKTAGS.
	  QUICKTAGS.each do |html_tag, wrapchar|
	    define_method "start_#{html_tag}" do |attributes|
	      make_quicktag_start_pair(html_tag, wrapchar, attributes)
	    end

	    define_method "end_#{html_tag}" do
	      make_quicktag_end_pair(wrapchar)
	    end
	  end

	  def start_ol(attrs)
	    self.in_ol = true
	  end

	  def end_ol
	    self.in_ol = false
	    write("\n")
	  end

	  def start_ul(attrs)
	    self.in_ul = true
	  end

	  def end_ul
	    self.in_ul = false
	    write("\n")
	  end

	  # List items are "# " inside an ordered list and "* " everywhere else.
	  def start_li(attrs)
	    write(in_ol ? "# " : "* ")
	    start_capture("li")
	  end

	  def end_li
	    stop_capture_and_write
	    write("\n")
	  end

	  # Links become "text":href. An <a> without href produces no link markup;
	  # its text is written like any other character data.
	  def start_a(attrs)
	    self.a_href = attrs_to_hash(attrs)['href']
	    return unless a_href

	    write(" \"")
	    start_capture("a")
	  end

	  def end_a
	    return unless a_href

	    stop_capture_and_write
	    write(["\":", a_href, " "])
	    self.a_href = false
	  end

	  # Turns the parser's Array of [name, value] pairs into a Hash keyed by the
	  # lower-cased attribute name. A later duplicate overwrites an earlier one.
	  def attrs_to_hash(array)
	    array.inject({}) do |collection, part|
	      collection[part[0].downcase] = part[1]
	      collection
	    end
	  end

	  # Images become " !src! ".
	  def start_img(attrs)
	    write([" !", attrs_to_hash(attrs)["src"], "! "])
	  end

	  def end_img
	  end

	  def start_tr(attrs)
	  end

	  def end_tr
	    write("|\n")
	  end

	  def start_td(attrs)
	    write("|")
	    start_capture("td")
	  end

	  def end_td
	    stop_capture_and_write
	    write("|")
	  end

	  def start_br(attrs)
	    write("\n")
	  end

	  # SGMLParser callback for tags without a start_/do_ handler. Permitted
	  # tags are echoed as raw HTML with their permitted attributes.
	  def unknown_starttag(tag, attrs)
	    return unless @@permitted_tags.include?(tag)

	    write(["<", tag])
	    attrs.each do |key, value|
	      write([" ", key, "=\"", value, "\""]) if @@permitted_attrs.include?(key)
	    end
	    # Close the echoed start tag so the output is well formed.
	    write(">")
	  end

	  # SGMLParser callback for end tags without a handler.
	  def unknown_endtag(tag)
	    write(["</", tag, ">"]) if @@permitted_tags.include?(tag)
	  end

	  # Return the textile after processing
	  def to_textile
	    result.join
	  end

	  # UNCONVERTED PYTHON METHODS
	  #
	  # def handle_charref(self, tag):
	  #     self._write(unichr(int(tag)))
	  #
	  # def handle_entityref(self, tag):
	  #     if self.entitydefs.has_key(tag):
	  #         self._write(self.entitydefs[tag])
	  #
	  # def handle_starttag(self, tag, method, attrs):
	  #     method(dict(attrs))
	  #

	end
	# A parser for SGML, using the derived class as static DTD.

	class SGMLParser

	  # Callback-driven SGML/HTML tokenizer. A subclass describes the document
	  # type by defining handler methods:
	  #   start_TAG(attrs) / end_TAG  - element tracked on the open-element stack
	  #   do_TAG(attrs)               - element handled without stack tracking
	  #   handle_data, handle_comment, handle_special, unknown_*  - the rest
	  # attrs is an Array of [lower-cased name, value] pairs.
	  # Feed input with #feed and finish with #close.

	  # Regular expressions used for parsing:
	  Interesting = /[&<]/
	  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
	                              '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
	                              '![^<>]*)?')

	  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
	  Charref = /&#([0-9]+)[^0-9]/

	  Starttagopen = /<[>a-zA-Z]/
	  Endtagopen = /<\/[<>a-zA-Z]/
	  Endbracket = /[<>]/
	  Special = /<![^<>]*>/
	  Commentopen = /<!--/
	  Commentclose = /--[ \t\n]*>/
	  Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
	  Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
	                            '(\s*=\s*' +
	                            "('[^']*'" +
	                            '|"[^"]*"' +
	                            '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

	  # Entity names resolved by handle_entityref.
	  Entitydefs =
	    {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

	  # verbose enables the diagnostics printed by report_unbalanced.
	  def initialize(verbose=false)
	    @verbose = verbose
	    reset
	  end

	  # Discards buffered input and the open-element stack.
	  def reset
	    @rawdata = ''
	    @stack = []
	    @lasttag = '???'
	    @nomoretags = false
	    @literal = false
	  end

	  # True when an element named gi is currently open.
	  def has_context(gi)
	    @stack.include? gi
	  end

	  # From now on all remaining input is passed to handle_data untouched.
	  def setnomoretags
	    @nomoretags = true
	    @literal = true
	  end

	  # Literal mode: start tags, comments and declarations are passed on as
	  # character data until the next end tag is parsed.
	  def setliteral(*args)
	    @literal = true
	  end

	  # Appends data to the input buffer and parses as much as is complete.
	  def feed(data)
	    @rawdata << data
	    goahead(false)
	  end

	  # Parses the rest of the buffer, flushing incomplete markup as data.
	  def close
	    goahead(true)
	  end

	  # Main scanning loop. Consumes @rawdata from the front and dispatches to
	  # the handlers. When _end is false, markup that may still be completed by
	  # a later #feed is left in the buffer.
	  def goahead(_end)
	    rawdata = @rawdata
	    i = 0
	    n = rawdata.length
	    while i < n
	      if @nomoretags
	        handle_data(rawdata[i..(n - 1)])
	        i = n
	        break
	      end
	      # Everything up to the next "<" or "&" is plain character data.
	      j = rawdata.index(Interesting, i) || n
	      handle_data(rawdata[i..(j - 1)]) if i < j
	      i = j
	      break if i == n
	      if rawdata[i, 1] == '<'
	        if rawdata.index(Starttagopen, i) == i
	          if @literal
	            handle_data(rawdata[i, 1])
	            i += 1
	            next
	          end
	          k = parse_starttag(i)
	          break unless k
	          i = k
	          next
	        end
	        if rawdata.index(Endtagopen, i) == i
	          k = parse_endtag(i)
	          break unless k
	          i = k
	          @literal = false
	          next
	        end
	        if rawdata.index(Commentopen, i) == i
	          if @literal
	            handle_data(rawdata[i, 1])
	            i += 1
	            next
	          end
	          k = parse_comment(i) # length of the comment, not an offset
	          break unless k
	          i += k
	          next
	        end
	        if rawdata.index(Special, i) == i
	          if @literal
	            handle_data(rawdata[i, 1])
	            i += 1
	            next
	          end
	          k = parse_special(i) # length of the declaration
	          break unless k
	          i += k
	          next
	        end
	      elsif rawdata[i, 1] == '&'
	        if rawdata.index(Charref, i) == i
	          ref = Regexp.last_match
	          i += ref[0].length
	          handle_charref(ref[1])
	          # The pattern also consumed the terminator; give it back unless
	          # it was the ";" that belongs to the reference.
	          i -= 1 unless rawdata[i - 1, 1] == ';'
	          next
	        end
	        if rawdata.index(Entityref, i) == i
	          ref = Regexp.last_match
	          i += ref[0].length
	          handle_entityref(ref[1])
	          i -= 1 unless rawdata[i - 1, 1] == ';'
	          next
	        end
	      else
	        raise RuntimeError, 'neither < nor & ??'
	      end
	      # We get here only if incomplete matches but
	      # nothing else
	      match = rawdata.index(Incomplete, i)
	      unless match == i
	        handle_data(rawdata[i, 1])
	        i += 1
	        next
	      end
	      j = match + Regexp.last_match[0].length
	      break if j == n # Really incomplete
	      handle_data(rawdata[i..(j - 1)])
	      i = j
	    end
	    # end while
	    if _end and i < n
	      handle_data(@rawdata[i..(n - 1)])
	      i = n
	    end
	    @rawdata = rawdata[i..-1]
	  end

	  # Parses the comment starting at offset i and reports its text through
	  # handle_comment. Returns the comment's length, or nil when the closing
	  # "-->" has not arrived yet.
	  def parse_comment(i)
	    rawdata = @rawdata
	    if rawdata[i, 4] != '<!--'
	      raise RuntimeError, 'unexpected call to handle_comment'
	    end
	    close_at = rawdata.index(Commentclose, i)
	    return nil unless close_at
	    close_length = Regexp.last_match[0].length
	    handle_comment(rawdata[i + 4..(close_at - 1)])
	    close_at + close_length - i
	  end

	  # Parses the start tag at offset i, collects its attributes and
	  # dispatches through finish_starttag. Returns the offset just past the
	  # tag, or nil when the tag is not complete yet.
	  def parse_starttag(i)
	    rawdata = @rawdata
	    j = rawdata.index(Endbracket, i + 1)
	    return nil unless j
	    attrs = []
	    if rawdata[i + 1, 1] == '>'
	      # SGML shorthand: <> == <last open tag seen>
	      k = j
	      tag = @lasttag
	    else
	      unless rawdata.index(Tagfind, i + 1)
	        raise RuntimeError, 'unexpected call to parse_starttag'
	      end
	      name = Regexp.last_match[0]
	      k = i + 1 + name.length
	      tag = name.downcase
	      @lasttag = tag
	    end
	    while k < j
	      # Only accept attributes that start inside this tag. An unbounded
	      # search would pick up words from the text after the tag, so that
	      # "<br />filed" gave <br> a "filed" attribute.
	      start = rawdata.index(Attrfind, k)
	      break unless start && start < j
	      found = Regexp.last_match
	      attrname, rest, attrvalue = found[1], found[2], found[3]
	      if not rest
	        attrvalue = '' # was: = attrname
	      elsif (attrvalue[0, 1] == "'" && attrvalue[-1, 1] == "'") or
	          (attrvalue[0, 1] == '"' && attrvalue[-1, 1] == '"')
	        attrvalue = attrvalue[1..-2]
	      end
	      attrs << [attrname.downcase, attrvalue]
	      # Continue after the match itself, including any skipped characters.
	      k = start + found[0].length
	    end
	    j += 1 if rawdata[j, 1] == '>'
	    finish_starttag(tag, attrs)
	    j
	  end

	  # Parses the end tag at offset i and dispatches through finish_endtag.
	  # Returns the offset just past the tag, or nil when it is incomplete.
	  def parse_endtag(i)
	    rawdata = @rawdata
	    j = rawdata.index(Endbracket, i + 1)
	    return nil unless j
	    tag = rawdata[i + 2..j - 1].strip.downcase
	    j += 1 if rawdata[j, 1] == '>'
	    finish_endtag(tag)
	    j
	  end

	  # Dispatches a start tag. Returns 1 when start_TAG handled it (the tag is
	  # pushed on the stack), 0 when do_TAG handled it, -1 when it went to
	  # unknown_starttag.
	  def finish_starttag(tag, attrs)
	    method = 'start_' + tag
	    if respond_to?(method)
	      @stack << tag
	      handle_starttag(tag, method, attrs)
	      return 1
	    end
	    method = 'do_' + tag
	    if respond_to?(method)
	      handle_starttag(tag, method, attrs)
	      return 0
	    end
	    unknown_starttag(tag, attrs)
	    -1
	  end

	  # Dispatches an end tag. Closes every element opened after the matching
	  # start tag, then the element itself. An empty tag name ("</>") closes
	  # the innermost open element.
	  def finish_endtag(tag)
	    if tag == ''
	      found = @stack.length - 1
	      if found < 0
	        unknown_endtag(tag)
	        return
	      end
	    else
	      unless @stack.include? tag
	        unknown_endtag(tag) unless respond_to?('end_' + tag)
	        return
	      end
	      # rindex picks the innermost open element of that name. index would
	      # pick the outermost and close the enclosing same-named element too.
	      found = @stack.rindex(tag)
	    end
	    while @stack.length > found
	      tag = @stack[-1]
	      method = 'end_' + tag
	      if respond_to?(method)
	        handle_endtag(tag, method)
	      else
	        unknown_endtag(tag)
	      end
	      @stack.pop
	    end
	  end

	  # Parses the "<!...>" declaration at offset i and reports its content
	  # through handle_special. Returns its length, or nil when incomplete.
	  def parse_special(i)
	    rawdata = @rawdata
	    close_at = rawdata.index(Endbracket, i + 1)
	    return nil unless close_at
	    bracket_length = Regexp.last_match[0].length
	    handle_special(rawdata[i + 1..(close_at - 1)])
	    close_at - i + bracket_length
	  end

	  # Calls the start_/do_ handler; override to change how attrs are passed.
	  def handle_starttag(tag, method, attrs)
	    self.send(method, attrs)
	  end

	  # Calls the end_ handler.
	  def handle_endtag(tag, method)
	    self.send(method)
	  end

	  # Prints a diagnostic for an unbalanced end tag when verbose is set.
	  def report_unbalanced(tag)
	    if @verbose
	      print '*** Unbalanced </' + tag + '>', "\n"
	      print '*** Stack:', @stack.inspect, "\n"
	    end
	  end

	  # Handles "&#NNN;". name is the decimal digit string. Code points in
	  # 0..255 are passed to handle_data as a single character; anything else
	  # goes to unknown_charref.
	  def handle_charref(name)
	    # Always read the digits as base 10: Integer("065") would be octal 53
	    # and Integer("09") raises ArgumentError.
	    n = name.to_s =~ /\A[0-9]+\z/ ? name.to_s.to_i : -1
	    if !(0 <= n && n <= 255)
	      unknown_charref(name)
	      return
	    end
	    handle_data(n.chr)
	  end

	  # Handles "&name;" by looking name up in Entitydefs.
	  def handle_entityref(name)
	    table = Entitydefs
	    if table.include?(name)
	      handle_data(table[name])
	    else
	      unknown_entityref(name)
	      return
	    end
	  end

	  # Default handlers: all no-ops, meant to be overridden by subclasses.
	  def handle_data(data)
	  end

	  def handle_comment(data)
	  end

	  def handle_special(data)
	  end

	  def unknown_starttag(tag, attrs)
	  end
	  def unknown_endtag(tag)
	  end
	  def unknown_charref(ref)
	  end
	  def unknown_entityref(ref)
	  end

	end