Last active
December 9, 2018 09:52
-
-
Save maraigue/d2971e8b3e4165fe4f639defe64aee2f to your computer and use it in GitHub Desktop.
はてなグループからエクスポートしたXMLファイルを、MovableType形式に変換する
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-
require 'text/hatena'
require 'rexml/document'
# Author name written to the "AUTHOR:" field of every generated entry.
AUTHOR = "BlogAuthor" # Enter the blog author's name here
# Decides whether a node met while walking the XML tree should be processed.
#
# @param node [Object] a node yielded while iterating a REXML parent
# @param expected_name [String, Regexp] matcher applied with `===` to the
#   tag name when +node+ is an element
# @return [Boolean] true for an element whose tag name matches; false for a
#   whitespace-only text node and for every other kind of node (which is
#   merely reported on STDERR)
#
# The process is terminated with a failure exit status when an element has an
# unexpected tag name or when a text node contains non-whitespace characters.
def check_node(node, expected_name)
  case node
  when REXML::Element
    unless expected_name === node.name
      # abort (rather than a bare exit) so the error is visible in the exit status.
      abort "ERROR: tag name \"#{node.name}\" (expected tag name: #{expected_name.inspect})"
    end
    true
  when REXML::Text
    # Whitespace-only text between elements is skipped silently.
    unless node.value =~ /\A\s*\z/
      abort "ERROR: unexpected text node: #{node.value.inspect}"
    end
    false
  else
    STDERR.puts "[XML_INFO #{node.class}] #{node}"
    false
  end
end
# Shared Text::Hatena instance; created on the first call to hatena2html.
$parser = nil

# Converts Hatena notation to HTML and removes the outer
# <div class="section"> ... </div> wrapper from the parser's output.
#
# @param text [String] Hatena-notation source; the argument itself is not
#   modified (a binary-tagged copy is handed to the parser)
# @return [String] the inner HTML, tagged as ASCII-8BIT
#
# The process is terminated with a failure exit status when the parser output
# is not enclosed in the expected wrapper.
def hatena2html(text)
  $parser ||= Text::Hatena.new

  # Input and output are both treated as raw bytes.
  # NOTE(review): presumably because Text::Hatena is not encoding-aware -- confirm.
  $parser.parse(text.dup.force_encoding("ASCII-8BIT"))
  html = $parser.html.force_encoding("ASCII-8BIT").strip

  opening = '<div class="section">'.dup.force_encoding("ASCII-8BIT")
  closing = '</div>'.dup.force_encoding("ASCII-8BIT")
  unless html.start_with?(opening) && html.end_with?(closing)
    # abort (rather than a bare exit) so the failure is visible in the exit status.
    abort "<div class=\"section\"> expected but not found"
  end

  # Keep only what lies between the opening and the closing tag.
  html[opening.bytesize...(html.bytesize - closing.bytesize)]
end
# Converts one exported XML file to Movable Type import format.
#
# The document is expected to contain a <diary> element holding <day>
# elements; every "*key*title" section found in a day's <body> text is printed
# to standard output as one entry. Progress messages go to STDERR.
#
# @param fname [String] path of the XML file, read as UTF-8
# @return [void]
#
# The process is terminated with a failure exit status when the document does
# not have the expected structure.
def main(fname)
  # File.open instead of Kernel#open: the latter runs a command when the
  # name starts with "|".
  xml = File.open(fname, "r:utf-8") { |f| f.read }

  REXML::Document.new(xml).each do |diary|
    next unless check_node(diary, "diary")

    STDERR.puts "Number of diary entries: #{diary.size}"
    diary.each do |day|
      convert_day(day) if check_node(day, "day")
    end
  end
end

# Prints every section of one <day> element as a Movable Type entry.
def convert_day(day)
  date = day.attributes["date"].to_s.strip
  title = day.attributes["title"].to_s.strip

  # The Hatena source is read from the single text node inside <body>.
  body = day.elements["body"]
  unless body && body.size == 1 && body[0].is_a?(REXML::Text)
    abort "ERROR: <body> must contain exactly one text node (date: #{date})"
  end
  STDERR.puts "Processing #{date} #{title} ..."

  # A new section starts at each line beginning with a single "*".
  sections = body[0].value.split(/\n(?=\*[^\*])/)
  sections.shift if sections.first && sections.first.empty?

  untimed = 0 # sections of this day seen so far whose key is not a timestamp
  sections.each do |section|
    abort "Title not given:\n#{section}" unless section =~ /\A\*([^\*]+)\*/
    key = $1
    heading, newline, text = $'.strip.partition("\n")
    abort "Body not found: #{heading}" if newline.empty?

    time =
      if key =~ /\A\d{9,}\z/
        # A key of nine or more digits is read as seconds since the epoch.
        Time.at(Integer(key, 10)).localtime
      else
        untimed += 1
        fallback_entry_time(date, untimed - 1)
      end

    # Leading "[tag]" groups of the heading become categories.
    tags = []
    while heading =~ /\A\[([^\]]+)\]/
      tags << $1
      heading = $'
    end

    # "-----" is the field separator of the output format, so occurrences in
    # the source are broken up with an empty <span>.
    text = text.gsub(/-----\n/, "-----<span></span>\n")
    # A line consisting of "====" or "=====" separates BODY from EXTENDED BODY.
    extended = nil
    if text =~ /^=====?$/
      extended = $'
      text = $`
    end

    puts "AUTHOR: #{AUTHOR}"
    puts "TITLE: #{heading}"
    puts "DATE: #{time.strftime("%m/%d/%Y %r")}"
    tags.each { |tag| puts "CATEGORY: #{tag}" }
    puts "-----"
    puts "BODY:"
    puts hatena2html(text)
    puts "-----"
    if extended
      puts "EXTENDED BODY:"
      puts hatena2html(extended)
      puts "-----"
    end
    puts "--------"
  end
end

# Returns 23:59:59 local time on +date+ (YYYY-M-D) minus +offset+ seconds.
def fallback_entry_time(date, offset)
  unless date =~ /\A(\d{4})-(\d+)-(\d+)\z/
    abort "Invalid date format: #{date}"
  end
  # Base 10 is given explicitly so that zero-padded fields such as "09" parse.
  # Subtracting the offset keeps working after 60 sections, where counting
  # the seconds field down would have gone negative.
  Time.local(Integer($1, 10), Integer($2, 10), Integer($3, 10), 23, 59, 59) - offset
end
# Command-line entry point: every argument is the path of a file to convert.
if ARGV.empty?
  # A missing argument is a usage error, so finish with a failure status.
  abort "Usage: hatenagroup2movabletype.rb FILES..."
end

ARGV.each { |fname| main(fname) }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment