Created
December 3, 2013 15:15
-
-
Save redraiment/7770887 to your computer and use it in GitHub Desktop.
iKnowledge 1.0 HTML格式自动转换成2.0 Markdown格式
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby
# -*- coding: utf-8 -*-
require 'cgi' | |
# Tiny regex-driven reader that turns markup into a tree of plain hashes.
#
# Every element is a Hash with :tag (the tag name) and :attr (a Hash of its
# double-quoted attributes). Elements that are not self-closing also carry
# :node, the Array of their children. Text is represented as
# { tag: '#text', text: '...' }.
module Html
  # Named entities that CGI.unescapeHTML does not decode, mapped to the
  # characters they stand for.
  NAMED_ENTITIES = {
    '&ldquo;'  => "\u201C", # left double quotation mark
    '&rdquo;'  => "\u201D", # right double quotation mark
    '&nbsp;'   => ' ',      # non-breaking space, flattened to a plain space
    '&hellip;' => "\u2026"  # horizontal ellipsis
  }.freeze
  NAMED_ENTITY_PATTERN = Regexp.union(NAMED_ENTITIES.keys).freeze

  # Builds an element hash from the inside of a tag (the text between the
  # angle brackets, e.g. 'a href="x"').
  #
  # @param content [String] tag name optionally followed by attributes
  # @return [Hash] { tag: name, attr: { 'key' => 'value', ... } }
  def self.tag(content)
    name, rest = content.split(/\s+/, 2)
    # Only attributes of the form key="value" are recognised.
    { tag: name, attr: Hash[(rest || '').scan(/(\w+)="([^"]*)"/)] }
  end

  # Decodes the entities of a raw text token.
  #
  # The named entities are replaced in a single pass *before* the CGI pass so
  # that an escaped ampersand such as "&amp;ldquo;" ends up as the literal
  # text "&ldquo;" instead of being decoded twice.
  #
  # @param raw [String] text token as it appears in the markup
  # @return [String] decoded text
  def self.text(raw)
    CGI.unescapeHTML(raw.gsub(NAMED_ENTITY_PATTERN, NAMED_ENTITIES))
  end
  private_class_method :text

  # Consumes tokens until the next closing tag (or until the tokens run out)
  # and returns the sibling nodes read on the way, recursing into every
  # opening tag to collect its children.
  #
  # @param token [Enumerator] token stream as produced by {lex}
  # @return [Array<Hash>] the nodes of the current nesting level
  def self.yacc(token)
    doc = []
    # Kernel#loop ends quietly when token.next raises StopIteration, which is
    # how the end of the input terminates the outermost level.
    loop do
      e = token.next
      break if e.start_with?('</')

      doc <<
        if !e.start_with?('<')
          { tag: '#text', text: text(e) }
        elsif e.end_with?('/>')
          tag(e[1..-3]) # self-closing: drop "<" and "/>", no :node key
        else
          tag(e[1..-2]).update(node: yacc(token))
        end
    end
    doc
  end

  # Splits the markup right before every "<" and right after every ">", so
  # each token is either one complete tag or the text between two tags.
  #
  # @param content [String] markup to tokenize
  # @return [Enumerator] external enumerator over the tokens
  def self.lex(content)
    content.split(/(?:(?=<)|(?<=>))/).to_enum
  end

  # Parses markup into a list of top-level nodes.
  #
  # @param xml [String] markup to parse
  # @return [Array<Hash>] top-level nodes
  def self.parse(xml)
    yacc(lex(xml))
  end
end
# Renders a parsed article (a tree of hashes with :tag, :attr, :node and, for
# text nodes, :text) as a Markdown post preceded by a YAML front matter.
class Markdown
  # @return [String] first whitespace-separated field of the timestamp
  attr_reader :date

  # @param html [Hash] root element of the article. Its 'title' attribute is
  #   the post title, the first text of its second child is the timestamp,
  #   and every child from the third one on is the body.
  #   NOTE(review): these positions depend on the exact layout of the input
  #   document (including whitespace text nodes) -- confirm against real data.
  def initialize(html)
    @title = html[:attr]['title']
    @timestamp = html[:node][1][:node][0][:text]
    @date = @timestamp.split(/\s+/)[0]
    @tags = html[:node][2..-1]
  end

  # @return [String] front matter block, terminated by a newline
  def front_matter
    [
      '---',
      'layout: article',
      "title: #{@title}",
      "date: #{@timestamp}",
      'category:',
      'excerpt:',
      '---'
    ].join("\n") + "\n"
  end

  # Converts a list of nodes to Markdown.
  #
  # Prints a message to standard error and exits with status 1 when it meets
  # a tag it has no rule for.
  #
  # @param tags [Array<Hash>, nil] nodes to render; nil (the children of a
  #   self-closing element) renders as an empty string
  # @param prefix [String] list marker inherited from the enclosing list
  # @return [String] the rendered nodes, concatenated
  def md(tags, prefix = "")
    (tags || []).map do |e|
      children = e[:node]
      case e[:tag]
      when '#text'
        e[:text].strip
      when /\Ah([1-6])\z/
        level = Regexp.last_match(1).to_i
        "#{'#' * level} #{md(children)}\n\n"
      when 'p'
        # Paragraphs inside a list are indented.
        indent = prefix.empty? ? '' : ' ' * 4
        "#{indent}#{md(children)}\n\n"
      when 'a'
        "[#{md(children)}](#{e[:attr]['href']})"
      when 'span'
        # A span is rendered as a link whose target is its title attribute.
        "[#{md(children)}](#{e[:attr]['title']})"
      when 'i', 'b', 'strong'
        "*#{md(children)}*"
      when 'pre'
        content = md(children)
        # Bare text becomes an indented code block, one level deeper when
        # inside a list.
        content = content.gsub(/^/, ' ' * (prefix.empty? ? 4 : 8)) if text_first?(e)
        content + "\n\n"
      when 'blockquote'
        content = md(children).rstrip
        content = content.gsub(/^/, '> ') if text_first?(e)
        content + "\n\n"
      when 'code'
        if e[:attr] && e[:attr].key?('language')
          # Fenced block; the language name 'shell' is emitted as 'bash'.
          "```#{e[:attr]['language'].sub('shell', 'bash')}\n#{md(children)}\n```"
        else
          "`#{md(children)}`"
        end
      when 'ol'
        md(children, prefix + '1. ') + "\n"
      when 'ul'
        md(children, prefix + '+ ') + "\n"
      when 'li'
        prefix + md(children, ' ' + prefix) + "\n"
      when 'dl'
        # Definition lists are passed through as HTML.
        "<dl>\n" + md(children, ' ') + "</dl>\n\n"
      when 'dt', 'dd'
        "#{prefix}<#{e[:tag]}>#{CGI.escapeHTML(md(children))}</#{e[:tag]}>\n"
      when 'img'
        "{% img #{File.basename(e[:attr]['src'])} %}\n\n"
      when 'sub', 'sup'
        "<#{e[:tag]}>#{CGI.escapeHTML(md(children))}</#{e[:tag]}>"
      else
        abort "Unknown: #{e}"
      end
    end.join
  end

  # @return [String] the complete post: front matter followed by the body
  def to_s
    front_matter + md(@tags)
  end

  private

  # True when the element has at least one child and the first one is text.
  def text_first?(element)
    first = (element[:node] || []).first
    !first.nil? && first[:tag] == '#text'
  end
end
# Converts one HTML article file into a Markdown post and writes it into the
# "_posts" directory of a site checkout.
class IKnowledge
  # Site checkout used when no other location is given.
  DEFAULT_HOME = File.join('/', 'Users', 'redraiment', 'Documents', 'zzp.me')

  # Reads and converts the article; nothing is written until {save}.
  #
  # @param filename [String] path of the HTML article to convert
  # @param home [String] root of the site checkout that contains "_posts"
  def initialize(filename, home = DEFAULT_HOME)
    @home = home
    # Read explicitly as UTF-8 so the result does not depend on the locale.
    html = Html.parse(File.read(filename, encoding: 'UTF-8'))
    @md = Markdown.new(html[0])
    # Strip the extension instead of substituting ".html" wherever it first
    # occurs, so names such as "a.html5.notes.html" are handled correctly.
    @filename = "#{@md.date}-#{File.basename(filename, '.*')}.md"
  end

  # Writes the post to <home>/_posts/<date>-<name>.md and reports the path on
  # standard output.
  def save
    fn = File.join(@home, '_posts', @filename)
    File.write(fn, to_s)
    puts "save to #{fn}"
  end

  # @return [String] the converted post
  def to_s
    @md.to_s
  end
end
# Script entry point: convert the file named by the first argument. Guarded
# so that loading this file from other code does not trigger a conversion.
if __FILE__ == $PROGRAM_NAME
  # Without this check a missing argument surfaces as a TypeError from
  # File.read(nil).
  abort "usage: #{File.basename($PROGRAM_NAME)} FILE.html" if ARGV.empty?

  IKnowledge.new(ARGV[0]).save
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment