-
-
Save skopp/5687139 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# html2markdown.rb
require 'nokogiri' | |
module Html2Markdown
  # SAX handler that turns a stream of HTML parse events into Markdown text.
  #
  # The parser calls #start_element, #characters and #end_element while it
  # reads the input. Each tag is routed to a +handle_<tag>+ (opening) or
  # +end_<tag>+ (closing) method when one exists; tags without handlers are
  # ignored. The generated Markdown accumulates in a buffer exposed through
  # #captured.
  class HtmlDocument < Nokogiri::XML::SAX::Document
    # Tags that are rewritten to an equivalent tag before dispatch.
    MAP = { "b" => "strong", "i" => "em" }.freeze

    # Tags (after MAP rewriting) whose direct text content is copied to the
    # output. "b" and "i" never appear here because MAP has already turned
    # them into "strong" and "em".
    TEXT_ELEMENTS = %w[p li h1 h2 h3 h4 h5 strong em code].freeze

    def initialize
      @element = nil          # innermost open tag, or nil outside any tag
      @element_stack = []     # all currently open tags, outermost first
      @list_stack = []        # one { type:, count: } entry per open list
      @capture_buffer = "".dup
      super()
    end

    # @return [String] the Markdown produced so far
    def captured
      @capture_buffer
    end

    # SAX callback for text nodes. Text is kept only when the innermost open
    # tag is one of TEXT_ELEMENTS; other non-blank text is reported on stderr
    # and discarded.
    #
    # @param string [String] the text content reported by the parser
    def characters(string)
      if TEXT_ELEMENTS.include?(@element)
        @capture_buffer << string
      elsif !string.strip.empty?
        # Whitespace between tags is formatting noise, so it is dropped
        # without a diagnostic.
        $stderr.puts "Would have added #{string}, but do not know #{@element}"
      end
    end

    # SAX callback for an opening tag: records the tag as the current element
    # and runs its +handle_<tag>+ method if one is defined.
    #
    # @param name [String] tag name as reported by the parser
    # @param attributes [Array] tag attributes; forwarded only to handlers
    #   that declare a parameter
    def start_element(name, attributes = [])
      tag = normalize(name)
      @element_stack.push(tag)
      @element = tag
      invoke(:"handle_#{tag}", attributes)
    end

    # SAX callback for a closing tag: restores the enclosing tag as the
    # current element, then runs +end_<tag>+. When no +end_<tag>+ exists the
    # opening handler is reused, which is how symmetric markers such as
    # "**", "*" and "`" get closed.
    #
    # @param name [String] tag name as reported by the parser
    def end_element(name)
      tag = normalize(name)
      close_element(tag)
      closer = :"end_#{tag}"
      closer = :"handle_#{tag}" unless respond_to?(closer)
      invoke(closer, [])
    end

    # Paragraphs (and line breaks, via the alias) emit nothing when opened.
    def handle_p
    end
    alias_method :handle_br, :handle_p

    def end_p
      @capture_buffer << "\n\n"
    end

    def end_br
      @capture_buffer << "\n"
    end

    def handle_strong
      @capture_buffer << "**"
    end

    def handle_em
      @capture_buffer << "*"
    end

    # h1..h5 become ATX headings: "# " through "##### ", newline-terminated.
    1.upto(5) do |level|
      prefix = "#{'#' * level} "
      define_method(:"handle_h#{level}") { @capture_buffer << prefix }
      define_method(:"end_h#{level}") { @capture_buffer << "\n" }
    end

    def handle_code
      @capture_buffer << "`"
    end

    def handle_ul
      @list_stack.push(type: :ul, count: 0)
    end

    def handle_ol
      @list_stack.push(type: :ol, count: 0)
    end

    # Leaving a list drops its entry so the enclosing list (if any) resumes
    # with its own type and item counter.
    def end_ul
      @list_stack.pop
    end
    alias_method :end_ol, :end_ul

    # Emits the indent and marker for a list item: "* " in unordered lists,
    # "N. " with a per-list counter in ordered lists.
    def handle_li
      list = @list_stack.last
      # A stray <li> outside any list would otherwise produce a negative
      # repeat count, which String#* rejects.
      depth = [@list_stack.size - 1, 0].max
      marker =
        if list && list[:type] == :ol
          list[:count] += 1
          "#{list[:count]}. "
        else
          "* "
        end
      # NOTE(review): one space per nesting level is kept from the original;
      # Markdown renderers usually need at least the marker width to nest an
      # item, so confirm the intended indent width.
      @capture_buffer << (" " * depth) << marker
    end

    def end_li
      @capture_buffer << "\n"
    end

    private

    # Lower-cases a tag name without mutating the parser's string and applies
    # the MAP rewrite.
    def normalize(name)
      tag = name.downcase
      MAP.fetch(tag, tag)
    end

    # Pops the stack down to (and including) the most recent +tag+, so the
    # current element falls back to whatever encloses it. An end tag that
    # matches nothing open leaves the stack untouched.
    def close_element(tag)
      index = @element_stack.rindex(tag)
      @element_stack.slice!(index..-1) if index
      @element = @element_stack.last
    end

    # Calls the named public handler if it exists, passing +attributes+ only
    # to handlers that declare required parameters.
    def invoke(handler_name, attributes)
      return unless respond_to?(handler_name)
      # A tag such as <element> would otherwise resolve to the SAX callback
      # #end_element itself; callbacks inherited from the base class are
      # never tag handlers.
      return if Nokogiri::XML::SAX::Document.method_defined?(handler_name)

      handler = method(handler_name)
      handler.arity > 0 ? handler.call(attributes) : handler.call
    end
  end

  # Converts HTML to Markdown.
  #
  # @param object the HTML input, passed unchanged to
  #   Nokogiri::HTML::SAX::Parser#parse
  # @return [String] the Markdown accumulated by HtmlDocument
  def self.convert(object)
    document = HtmlDocument.new
    Nokogiri::HTML::SAX::Parser.new(document).parse(object)
    document.captured
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment