Last active
August 29, 2015 14:03
-
-
Save jcoglan/b9f13b6eb0a78f102634 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri'
# Converts an HTML fragment into Markdown text with reference-style links.
#
# The converter walks the parsed node tree and dispatches every node to a
# +visit_<node name>+ method. Block-level visitors (headings, paragraphs,
# list items) collect inline text in @block, wrap it with #format_block and
# append the result to @buffer. Inline visitors append to @block directly and
# therefore expect to run while a block is open.
#
# Known limitation: a block nested inside another open block (for example a
# list inside a list item that also has its own text) resets @block, so the
# outer block's pending text is dropped (see the nil guard in #format_block).
class Html2Md
  # A link or image target collected for the reference list at the end.
  Link = Struct.new(:href, :title)

  # One level of block nesting: +head+ prefixes the first emitted line,
  # +body+ prefixes every later line, +opened+ records whether +head+ has
  # already been used.
  Format = Struct.new(:name, :head, :body, :opened)

  # Default maximum line width of the generated Markdown.
  WIDTH = 80

  # @param markup [String] HTML fragment to convert
  # @param width [Integer] maximum line width used when wrapping text
  def initialize(markup, width = WIDTH)
    @markup = markup
    @doc = Nokogiri::HTML.fragment(markup)
    @width = width
  end

  # Renders the document and appends one "[n]: href (title)" line per link
  # or image encountered.
  #
  # @return [String] the Markdown text
  def to_markdown
    @buffer = ''.dup
    @stack = []
    @links = []
    traverse(@doc)
    @links.each_with_index do |link, index|
      @buffer << "[#{index + 1}]: #{link.href}"
      @buffer << " (#{link.title})" if link.title
      @buffer << "\n"
    end
    @buffer
  end

  # Visits every child of +doc+ in document order.
  def traverse(doc)
    doc.children.each { |node| visit(node) }
  end

  # Dispatches +node+ to the visitor named after it.
  #
  # @raise [NoMethodError] when no visit_* method exists for the node name
  def visit(node)
    __send__ "visit_#{node.name}", node
  end

  # visit_h1 .. visit_h6: ATX headings ("# ", "## ", ...). Wrapped lines
  # repeat the same prefix because head and body are identical.
  (1..6).each do |level|
    define_method("visit_h#{level}") do |node|
      prefix = "#{'#' * level} "
      @stack << Format.new("h#{level}", prefix, prefix, false)
      @buffer << format_block(node)
      @stack.pop
      newlines(1)
    end
  end

  # Emits a wrapped paragraph followed by a separator line.
  def visit_p(node)
    @stack << Format.new('p', '', '', false)
    @buffer << format_block(node)
    @stack.pop
    newlines(1)
  end

  # Emits preformatted content verbatim: as a raw <pre> element when the
  # markup contains <span> tags, otherwise as a fenced code block.
  def visit_pre(node)
    @block = ''.dup
    @pre = true # tells visit_text / visit_code to leave content untouched
    traverse(node)
    if node.inner_html.include?('<span ')
      @buffer << '<pre>' << @block << "</pre>\n\n"
    else
      @buffer << "```\n" << @block << "\n```\n\n"
    end
    @block = @pre = nil
  end

  # Inside <pre> the content is copied as is; elsewhere <code> is rendered
  # like <tt>, i.e. as an inline code span.
  def visit_code(node)
    @code = true
    result = @pre ? traverse(node) : visit_tt(node)
    @code = false
    result
  end

  # Prefixes every line produced by the children with "> ".
  def visit_blockquote(node)
    @stack << Format.new('blockquote', '> ', '> ', false)
    traverse(node)
    @stack.pop
    newlines(1)
  end

  # Copies the element to the output as raw HTML.
  def visit_iframe(node)
    @buffer << node.to_html
    newlines(2)
  end
  alias :visit_object :visit_iframe
  alias :visit_style :visit_iframe

  # Bullet list. The ordered-list counter is suspended while the children
  # are visited so that bullets nested in an <ol> are not numbered.
  def visit_ul(node)
    outer_index = @list_index
    @list_index = nil
    traverse(node)
    @list_index = outer_index
    newlines(1)
  end

  # Numbered list. The enclosing list's counter is restored afterwards so a
  # nested <ol> does not disturb the numbering of the outer one.
  def visit_ol(node)
    outer_index = @list_index
    @list_index = 1
    traverse(node)
    @list_index = outer_index
    newlines(1)
  end

  # Emits one list item, numbered when an ordered-list counter is active.
  def visit_li(node)
    leader = @list_index ? "#{@list_index}. " : '- '
    # Continuation lines are indented to the width of the leader so they
    # line up with the item's text.
    @stack << Format.new('li', leader, ' ' * leader.length, false)
    @list_index += 1 if @list_index
    @buffer << format_block(node)
    @stack.pop
  end

  # Reference-style link: "[text][n]".
  def visit_a(node)
    @links << Link.new(node['href'], node['title'])
    # Capture the number now: children (e.g. an <img>) may register further
    # references before the closing bracket is written.
    index = @links.size
    wrap_inline(node, '[', "][#{index}]")
  end

  # Reference-style image: "![alt][n]".
  def visit_img(node)
    @links << Link.new(node['src'], node['title'])
    @block << '![' << (node['alt'] || '') << "][#{@links.size}]"
  end

  # Emphasis; uses "_" when the text itself contains an asterisk.
  def visit_em(node)
    wrap_inline(node, node.text.include?('*') ? '_' : '*')
  end

  # Strong emphasis; uses "__" when the text itself contains an asterisk.
  def visit_strong(node)
    wrap_inline(node, node.text.include?('*') ? '__' : '**')
  end

  # visit_i, visit_b, visit_del, visit_ins, visit_cite: these elements are
  # kept as literal HTML tags around their converted content.
  %w[i b del ins cite].each do |tag|
    define_method("visit_#{tag}") do |node|
      wrap_inline(node, "<#{tag}>", "</#{tag}>")
    end
  end

  # Inline code span; uses double backticks when the text contains one.
  def visit_tt(node)
    delim = node.text.include?('`') ? '``' : '`'
    @block << delim
    @code = true
    traverse(node)
    @code = false
    @block << delim
  end

  # A line break inside the current block; #format_block turns it into a
  # Markdown hard break.
  def visit_br(node)
    @block << "\n"
  end

  # Keeps <span> elements (with their class attribute) as literal HTML.
  def visit_span(node)
    wrap_inline(node, %Q{<span class="#{node['class']}">}, '</span>')
  end

  # Appends text to the open block. Outside <pre>, runs of spaces/newlines
  # collapse to one space and " - " becomes " -- ". Text met while no block
  # is open is ignored.
  def visit_text(node)
    text = @pre ? node.text : node.text.gsub(/[ \n]+/, ' ').gsub(' - ', ' -- ')
    @block << text if @block
  end

  # Collects the inline content of +node+ and returns it wrapped to the
  # available width, each line prefixed according to the format stack.
  #
  # @return [String] the formatted lines, each terminated by "\n"; empty
  #   when a nested block already consumed (and reset) @block
  def format_block(node)
    @block = ''.dup
    traverse(node)
    return '' if @block.nil?

    # The trailing space guarantees the last word is followed by a separator.
    text = "#{@block.strip} "
    @block = nil

    indent = @stack.inject(0) { |sum, format| sum + format.body.length }
    width = [@width - indent, 1].max # never build an invalid {1,n} quantifier

    # First alternative: as many characters as fit, ending at a space run or
    # line break. Second alternative: a single token longer than the width
    # gets a line of its own instead of losing its leading characters.
    pattern = /.{1,#{width}}(?: +|\n)|[^ \n]+(?: +|\n)/

    lines = text.scan(pattern).map do |line|
      prefix = @stack.map { |format| format.opened ? format.body : format.head }.join
      @stack.each { |format| format.opened = true }
      # Strip trailing spaces; a line that ended at a <br> gets the two
      # trailing spaces Markdown requires for a hard line break.
      prefix + line.gsub(/ *$/, '').gsub(/\n$/, '  ')
    end
    (lines + ['']).join("\n")
  end

  # Appends +n+ separator lines, each carrying the continuation prefix of
  # the blocks that are still open.
  def newlines(n)
    line = "#{@stack.map { |format| format.body }.join}\n"
    n.times { @buffer << line }
  end

  private

  # Writes +open+, the converted children of +node+, then +close+ to the
  # open block.
  def wrap_inline(node, open, close = open)
    @block << open
    traverse(node)
    @block << close
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment