Wordpress Importer
# coding: utf-8
#1) Synchronize your blog comments with IntenseDebate
#2) Export an XML file from your existing blog. Leave the old blog running, so that attachments can still be downloaded.
#3) Rename the .xml file to 'wordpress.xml' and place it in the root of the new Nesta website.
#4) Run the import
#   This does a fast 'offline import'. Use this first to verify there are no errors:
#     ruby -r './import.rb' -e 'Nesta::WordpressImport.process(true)'
#   Use this to perform a full import. It downloads all attachments and checks for 301/302 redirects to determine which Alternate Urls need to be specified for each page:
#     ruby -r './import.rb' -e 'Nesta::WordpressImport.process()'
#   If you want to keep the "Author", "wp_date", or "wp_template" metadata fields, pass an empty array as the drop_metadata parameter:
#     ruby -r './import.rb' -e 'Nesta::WordpressImport.process(true, "wordpress.xml", [])'
#   Or drop everything:
#     ruby -r './import.rb' -e 'Nesta::WordpressImport.process(true, "wordpress.xml", ["Author","wp_date","wp_template","Alternate Urls","Atom ID","Post ID","Status","Categories","Tags"])'
#5) URL rewrite /wp-content/ to /attachments/wp-content/ (see the sketch right after this step list)
#6) URL redirect from each space-delimited value in 'Alternate Urls' in each page's metadata to the actual page path (see the sketch after the Notes below)
#7) Configure IntenseDebate to use page.metadata["Post ID"] for the idcomments_post_id value (if present):
#   <script>
#   var idcomments_acct = 'YOUR ACCT ID';
#   var idcomments_post_id; // <- this is where you use "Post ID"
#   var idcomments_post_url;
#   </script>
#   <script type="text/javascript" src="http://www.intensedebate.com/js/genericLinkWrapperV2.js"></script>
#8) Add code to hide any page that has a value for page.metadata['status']; Nesta doesn't do this out of the box (see the sketch after the TODO list below)
#9) Hand-correct invalid HTML
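#
#   A sketch for step 5 (illustrative only): one way to do the rewrite is a
#   Sinatra before filter in your Nesta app.rb (assuming you reopen Nesta::App
#   the way the Nesta docs suggest). A rewrite rule in your web server works just as well.
#
#     module Nesta
#       class App
#         before do
#           if request.path_info.start_with?('/wp-content/')
#             request.path_info = "/attachments#{request.path_info}"
#           end
#         end
#       end
#     end
#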
# Notes
# - All non-public blog articles will be placed in the /content/pages/drafts folder
# - All pages without an assigned URL will be placed in /content/pages/drafts
# - Pages with an assigned URL (even if they are private or draft) go where they were in Wordpress, just with "status: private" etc.
# - Public blog articles will be placed in /content/pages/blog/YYYY/blog-title-sanitized.mdown
# - index.mdown files are not generated. Instead, you'll get 'name.mdown' and a folder 'name' in the same directory.
# - All Wordpress user-defined fields in posts and pages will be added at the end of the file as XML comments
# - Password-protected pages/posts will be given a status of 'protected', but the password will not be copied.
# - Sticky posts, comment status, menu order, post parents, and ping status values are ignored.
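#
# Sketch for step 6 (illustrative and untested): a Sinatra not_found handler that
# 301-redirects any 'Alternate Urls' value to the page that now owns it. The Page
# accessors (find_all, abspath) and the metadata[...] form follow the convention in
# step 7 above and are assumptions - check them against your Nesta version. For a
# large site, build the URL-to-page lookup once instead of on every 404.
#
#   module Nesta
#     class App
#       not_found do
#         Page.find_all.each do |page|
#           alternates = (page.metadata["Alternate Urls"] || "").split
#           redirect(page.abspath, 301) if alternates.include?(request.path_info)
#         end
#         halt 404
#       end
#     end
#   end
#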
# TODO (perhaps)
# - Invalid HTML is not 'tidied up', and REXML chokes on it
# - Tags and categories from Wordpress are imported as-is. Maybe they need to be combined somehow; I haven't figured out exactly how to map them onto Nesta's design.
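#
# Sketch for step 8 (illustrative only): a before filter that 404s direct requests
# to any page carrying a Status value. find_by_path and metadata[...] are the same
# assumptions as above; you would also want to keep such pages out of menus,
# article lists, and feeds.
#
#   module Nesta
#     class App
#       before do
#         page = Page.find_by_path(request.path_info.sub(/^\//, ''))
#         halt 404 if page && page.metadata["Status"]
#       end
#     end
#   end
#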
require 'rubygems'
require 'net/http'
require 'uri'
require 'hpricot'
require 'fileutils'
require 'yaml'
require 'time'
# ruby -r './import.rb' -e 'Nesta::WordpressImport.process()'
module Nesta
  # This importer takes a wordpress.xml file, which can be exported from your
  # wordpress.com blog (/wp-admin/export.php).
  class WordpressImport
    # Returns a list of URLs (including the one passed in) based on HTTP redirects that occurred.
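    # For example, if WordPress now serves a post from a "pretty" permalink,
    # get_redirections("http://example.com/?p=123") might return
    # ["http://example.com/?p=123", "http://example.com/2011/08/my-post/"]
    # (illustrative URLs only).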
    def get_redirections(url, limit = 10)
      return [] if limit == 0 # Prevent infinite loops
      url_list = [url]
      uri = URI.parse(url)
      Net::HTTP.start(uri.host, uri.port){ |http|
        http.request_get(strip_domain(url)){ |res|
          case res
          when Net::HTTPSuccess
            return url_list # We are done!
          when Net::HTTPRedirection
            return url_list.concat(get_redirections(res.header['location'], limit - 1)).uniq
          else
            return url_list
          end
        }
      }
    rescue
      puts "Failed to reach #{url}, #{$!.to_s}"
      return [url]
    end
    # Downloads a file, correctly following any redirections required.
    def download_file(url, dest_path, limit = 10)
      raise ArgumentError, 'HTTP redirect too deep' if limit == 0
      puts "Downloading #{url} to #{make_relative(dest_path)}"
      uri = URI.parse(url)
      Net::HTTP.start(uri.host, uri.port){ |http|
        http.request_get(strip_domain(url)){ |res|
          case res
          when Net::HTTPSuccess
            # On success, buffer to disk
            File.open(dest_path, 'w'){ |f|
              res.read_body{ |seg|
                f << seg
                # hack -- adjust to suit:
                sleep 0.005
              }
            }
            return true
          # Follow redirects
          when Net::HTTPRedirection
            return download_file(res.header['location'], dest_path, limit - 1)
          else
            res.error! # raise for any other response
            return false
          end
        }
      }
    rescue
      puts "Failed to reach #{url}, #{$!.to_s}"
    end
    def download(url, dest_path, upload_date)
      return false if File.exists?(dest_path)
      FileUtils.mkdir_p(File.dirname(dest_path)) if not Dir.exists?(File.dirname(dest_path))
      if download_file(url, dest_path)
        File.utime(upload_date, upload_date, dest_path)
      end
      return true
    end
    def make_relative(path)
      here = File.expand_path('.', ::File.dirname(__FILE__))
      return (path.start_with?(here)) ? path[here.length..-1] : path
    end
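    # e.g. strip_domain("http://example.com/2011/08/my-post/?replytocom=5")
    # returns "/2011/08/my-post/?replytocom=5"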
    def strip_domain(url)
      uri = URI.parse(url)
      return uri.path + ((uri.query.nil? || uri.query.empty?) ? "" : "?#{uri.query}")
    end
    def get_domain(url)
      uri = URI.parse(url)
      port = uri.port.nil? ? "" : ":#{uri.port}"
      return "http://#{uri.host}#{port}"
    end
    def get_alternate_urls(item, metadata, offline=false)
      # Build an array of the 4 data sources: wp:post_id, link, guid, and metadata 'url'
      post_id = item.at('wp:post_id').inner_text
      urls = [item.at('link').inner_text,
              item.at('guid').inner_text,
              get_domain(item.at('link').inner_text) + "/?p=#{post_id}"]
      meta_url = metadata["url"]
      urls.push('/' + meta_url.gsub(/^\//, "")) if not meta_url.nil?
      # Cleanse array
      urls = urls.uniq.reject{|i| i.nil? || i.empty?}
      # Use HTTP requests to capture any redirections.
      short_urls = []
      urls.each{|v|
        if offline
          short_urls.push(strip_domain(v))
        else
          get_redirections(v).each{|url|
            short_urls.push(strip_domain(url))
          }
        end
      }
      # Strip domains, duplicates, and remove empty values
      return short_urls.uniq.reject{|i| i.nil? || i.empty?}
    end
    def self.process(offline=false, filename = "wordpress.xml", drop_metadata=["Author","wp_date","wp_template"])
      WordpressImport.new.import(offline, filename, drop_metadata)
    end
    def import(offline=false, filename = "wordpress.xml", drop_metadata=["Author","wp_date","wp_template"])
      import_count = Hash.new(0)
      doc = Hpricot::XML(File.read(filename))
      # Where to store the attachments. We'll need a URL rewriting rule to change '/wp-content' -> '/attachments/wp-content'
      attachment_dir = File.expand_path('content/attachments', ::File.dirname(__FILE__))
      # Where to store posts and pages
      content_dir = File.expand_path('content/pages', ::File.dirname(__FILE__))
      # A hash to detect duplicate URLs in the XML
      items_by_urls = {}
      authors = {}
      # Build a hash of login -> display name for authors
      (doc/:channel).first.get_elements_by_tag_name('wp:author').each do |author|
        author_login = author.at('wp:author_login').inner_text.strip
        author_name = author.at('wp:author_display_name').inner_text.strip
        authors[author_login] = author_name
        puts "Author #{author_login} will be mapped to #{author_name}"
      end
      (doc/:channel/:item).each do |item|
        # Get the item title
        title = item.at(:title).inner_text.strip
        # Get the post_id
        post_id = item.at('wp:post_id').inner_text
        puts "Importing #{post_id} - #{title}"
        # Item type: post, page, or attachment
        type = item.at('wp:post_type').inner_text
        # GMT posted date - always available for attachments, but not always for posts/pages that have never been published.
        # Fall back to post_date when we get an ArgumentError - it will be off by an unknown timezone, but the date should be correct.
        post_date_gmt = Time.parse(item.at('wp:post_date_gmt').inner_text) rescue Time.parse(item.at('wp:post_date').inner_text)
        # Download attachments unless they already exist
        if type == "attachment"
          a_url = item.at('wp:attachment_url').inner_text
          if offline
            puts "(Offline) Skipping #{a_url}"
          else
            download(a_url, attachment_dir + strip_domain(a_url), post_date_gmt)
          end
        else # posts and pages
          # Parse metadata into a hash
          metas = Hash[item.search("wp:postmeta").map{ |meta| [meta.at('wp:meta_key').inner_text, meta.at('wp:meta_value').inner_text]}]
          # The template used by Wordpress. Can be used to allow regex migrations to Nesta templates
          wp_template = metas["_wp_page_template"]
          # Discard all meta keys starting with '_', and all empty meta values
          metas.reject!{|k,v| k[0] == "_" || v.empty?}
          # Parse tags and categories
          tags = (item/:category).reject{|c| c.attributes['domain'] != 'post_tag'}.map{|c| c.attributes['nicename']}.uniq
          categories = (item/:category).reject{|c| c.attributes['domain'] != 'category'}.map{|c| c.attributes['nicename']}.reject{|c| c == 'uncategorized'}.uniq
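          # For reference, WXR exports typically mark these up as, e.g.
          #   <category domain="post_tag" nicename="ruby"><![CDATA[Ruby]]></category>
          #   <category domain="category" nicename="programming"><![CDATA[Programming]]></category>
          # so the two selects above would yield ["ruby"] and ["programming"].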
          # Calculate the status of the page or post: publish, draft, pending, private, or protected
          status = item.at('wp:status').inner_text
          status = "protected" if not item.at('wp:post_password').inner_text.empty?
          is_public = status == "publish"
          # Get the slug, fall back to the normalized title, then fall back to the post ID.
          post_name = item.at('wp:post_name').inner_text
          post_name = title.downcase.split.join('-') if post_name.empty?
          post_name = post_id if post_name.empty?
          # Sanitize
          post_name = post_name.gsub(/[-]?[^A-Za-z0-9-]+[-]?/,"-").gsub(/^[-]+/,"").gsub(/[-]+$/,"")
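          # e.g. a title of "Hello, World!" becomes "hello,-world!" above, and this
          # sanitize step reduces it to "hello-world"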
puts "\n\n\n#{post_name}\n\n\n" if post_name.include?("/") | |
#Calculate the location for the .mdown file | |
link_uri = URI.parse(item.at('link').inner_text) | |
old_path_query = strip_domain(item.at('link').inner_text) | |
#The path (no domain, no port, no querystring) of the item's active URL. No trailing or leading slashes | |
new_path = link_uri.path.gsub(/^\//,"").gsub(/\/$/,"") | |
if type == "page" | |
#Un-named pages go in the drafts folder, regardless of their status. | |
#Named, but status=draft, private, protected, pending posts simply get flagged, not renamed | |
if new_path.empty?; | |
new_path = "drafts/#{post_name}" | |
elsif !is_public | |
puts "Page #{new_path} has a status of #{status}. Please review file." | |
end | |
elsif type == "post" | |
#Only public articles go into /blog/ | |
if is_public | |
new_path = "blog/#{post_date_gmt.year}/#{post_name}" | |
puts "Article #{old_path_query} was placed at #{new_path}" | |
else | |
new_path = "drafts/#{post_name}" | |
puts "(#{status} article #{old_path_query} was placed at #{new_path}" | |
end | |
end | |
short_new_path = new_path | |
#Add dir and extension | |
new_path = "#{content_dir}/#{new_path}.mdown" | |
#Acquire a list of all the URLs that may have been used to link to this post or page, so we can add redirects later | |
#Exclude any duplicates with previous files - first come, first serve. | |
alternate_urls = get_alternate_urls(item,metas, offline).reject{|u| | |
if items_by_urls.has_key?(u) | |
puts "Duplicate URL '#{u}' used by more than one item - will not be added" | |
puts "Current: #{short_new_path}, First item: #{items_by_urls[u]}" | |
puts "" | |
true | |
else | |
items_by_urls[u] = short_new_path | |
false | |
end | |
} | |
#Convert post_id to an int unless its a string (to avoid the quotes) | |
post_id = post_id.to_i if post_id.match(/^\d+$/) | |
#Generate metadata table for new file | |
metadata = { | |
"Alternate Urls" => alternate_urls * " ", | |
"Atom ID" => item.at('guid').inner_text.strip, | |
"Post ID" => post_id, | |
"Author" => authors[item.at('dc:creator').inner_text.strip], | |
"wp_date" => post_date_gmt.to_s, | |
"wp_template" => wp_template, | |
"Summary" => item.at('excerpt:encoded').inner_text.strip, | |
"Status" => status == "publish" ? "" : status, | |
"Categories" => categories * ", ", | |
"Tags" => tags * ", " | |
} | |
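          # For illustration only (all values made up), a published post imported with
          # the default drop_metadata ends up as a .mdown file roughly like:
          #
          #   Alternate Urls: /?p=123 /2011/08/my-first-post/
          #   Atom ID: http://example.wordpress.com/?p=123
          #   Post ID: 123
          #   Summary: A short excerpt.
          #   Categories: programming
          #   Tags: ruby
          #   Date: Aug 25 2011
          #
          #   #My First Post
          #
          #   <p>Post body as exported by WordPress...</p>
          #
          #   <!--custom_field: custom value-->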
          # Don't add any values that are empty
          metadata.reject!{|k,v| v.nil? or v.to_s.empty?}
          # Articles/posts (not pages) get a 'Date' value - this is what Nesta uses to differentiate them.
          # %e pads single-digit days with a space, so collapse the resulting double space.
          metadata["Date"] = post_date_gmt.strftime("%b %e %Y").gsub("  ", " ") if type == "post"
          # Make sure metadata uses string keys
          #metadata = metadata.inject({}){|memo,(k,v)| memo[k.to_s] = v; memo}
          # Drop the excluded metadata
          metadata.reject!{|k,v| drop_metadata.include?(k)}
          # Create the file
          FileUtils.mkdir_p File.dirname(new_path) if not Dir.exists?(File.dirname(new_path))
          File.open(new_path, "w") do |f|
            f.puts metadata.to_yaml.gsub(/^---\s*/,"") # Strip leading dashes
            f.puts "\n##{title}\n\n"
            f.puts item.at('content:encoded').inner_text
            f.puts "\n"
            metas.each { |key, value|
              # '--' is not allowed inside an HTML comment, so collapse runs of hyphens
              f.puts "<!--#{key.gsub(/--+/,"-")}: #{value.gsub(/--+/,"-")}-->\n"
            }
          end
        end
        import_count[type] += 1
      end
      import_count.each do |key, value|
        puts "Imported #{value} #{key}s"
      end
    end
  end
end