Wordpress Importer
# coding: utf-8
#1) Synchronize your blog comments with IntenseDebate
#2) Export an XML file from your existing blog. Leave the old blog online, so that attachments can still be downloaded.
#3) Rename the .xml file to 'wordpress.xml' and place in root of the new nesta website.
#4) Run the import
# This does a fast 'offline import'. Use this first to verify there are no errors
# ruby -r './import.rb' -e 'Nesta::WordpressImport.process(true)'
# Use this to perform a full import. Downloads all attachments and checks for 301/302 redirects to determine what Alternate Urls need to be specified for each page
# ruby -r './import.rb' -e 'Nesta::WordpressImport.process()'
# If you want to add the "Author", "wp_date", or "wp_template" metadata fields, pass in an empty array to the drop_metadata parameter
# ruby -r './import.rb' -e 'Nesta::WordpressImport.process(true, "wordpress.xml",[])'
# Or, drop everything
# ruby -r './import.rb' -e 'Nesta::WordpressImport.process(true, "wordpress.xml",["Author","wp_date","wp_template", "Alternate Urls", "Atom ID", "Post ID", "Status", "Categories", "Tags"])'
#5) URL rewrite /wp-content/ to /attachments/wp-content/
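# For example (a sketch, not part of this script), using the rack-rewrite gem in config.ru:
#   require 'rack/rewrite'
#   use Rack::Rewrite do
#     rewrite %r{^/wp-content/(.*)}, '/attachments/wp-content/$1'
#   end
# Equivalent rules can be written for nginx/Apache if Rack isn't fronting the site.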
#6) URL redirect from each space-delimited value in 'Alternate Urls' of each page metadata to the actual page path
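# A possible sketch (assuming Nesta's Page.find_all and metadata helpers), e.g. in app.rb:
#   not_found do
#     page = Nesta::Page.find_all.find do |p|
#       (p.metadata('alternate urls') || '').split.include?(request.path_info)
#     end
#     page ? redirect(page.abspath, 301) : halt(404)
#   end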
#7) Configure IntenseDebate to use the page.metadata["Post ID"] for the idcomments_post_id value (if present)
#<script>
#var idcomments_acct = 'YOUR ACCT ID';
#var idcomments_post_id; //<- this is where you use "Post ID"
#var idcomments_post_url;
#</script>
#<script type="text/javascript" src="http://www.intensedebate.com/js/genericLinkWrapperV2.js"></script>
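# The Post ID line might be rendered from metadata like so (ERB, assuming @page is the current Nesta page):
#var idcomments_post_id = '<%= @page.metadata("post id") %>';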
#8) Add code to hide any page where there is a value for page.metadata['status']. Nesta doesn't do this.
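# A sketch of one way to do it (assuming Nesta's find_articles and metadata helpers):
#   @articles = Nesta::Page.find_articles.reject { |a| a.metadata('status') }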
#9) Hand-correct invalid html
# Notes
# - all non-public blog articles will be placed in the /content/pages/drafts folder
# - all pages without an assigned URL will be placed in /content/pages/drafts
# - pages with an assigned URL (even if they are private or draft) go where they were in Wordpress, just with "status: private" etc.
# - public blog articles will be placed in /content/pages/blog/YYYY/blog-title-sanitized.mdown
# - index.mdown files are not generated. Instead, you'll get 'name.mdown' and folder 'name' in the same directory.
# - All Wordpress user-defined fields in posts and pages will be added at the end of the file as XML comments
# - Password-protected pages/posts will be given a status of 'protected', but the password will not be copied.
# - Sticky posts, comment status, menu order, post parents, and ping status values are ignored.
# TODO (perhaps)
# - Invalid HTML is not 'tidied up', and REXML chokes on it
# - Tags and categories from Wordpress are imported as-is. They may need to be combined; I haven't figured out exactly how to map them to Nesta's design.
require 'rubygems'
require 'net/http'
require 'uri'
require 'hpricot'
require 'fileutils'
require 'yaml'
require 'time'
# ruby -r './import.rb' -e 'Nesta::WordpressImport.process(false, "wordpress.xml")'
module Nesta
# This importer takes a wordpress.xml file, which can be exported from your
# wordpress.com blog (/wp-admin/export.php).
class WordpressImport
# Returns a list of URLs (including the one passed in) based on any HTTP redirects that occurred.
def get_redirections(url, limit = 10)
return [] if limit == 0 #Prevent infinite loops
url_list = [url]
uri = URI.parse(url)
Net::HTTP.start(uri.host,uri.port){ |http|
http.request_get(strip_domain(url)){ |res|
case res
when Net::HTTPSuccess
return url_list #We are done!
when Net::HTTPRedirection
return url_list.concat(get_redirections(res.header['location'] , limit - 1)).uniq
else
return url_list
end
}
}
rescue
puts "Failed to reach #{url}, #{$!.to_s}"
return [url]
end
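# Example (hypothetical URLs; assumes the old blog still answers requests):
#   get_redirections("http://example.com/?p=42")
#   #=> ["http://example.com/?p=42", "http://example.com/2011/08/some-post/"]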
# Downloads a file, correctly following any redirections required.
def download_file(url,dest_path, limit = 10)
raise ArgumentError, 'HTTP redirect too deep' if limit == 0
puts "Downloading #{url} to #{make_relative(dest_path)}"
uri = URI.parse(url)
Net::HTTP.start(uri.host,uri.port){ |http|
http.request_get(strip_domain(url)){ |res|
case res
when Net::HTTPSuccess
#On success, buffer to disk
File.open(dest_path,'w'){ |f|
res.read_body{ |seg|
f << seg
#hack -- adjust to suit:
sleep 0.005
}
}
return true
#Follow redirects
when Net::HTTPRedirection
return download_file(res.header['location'] ,dest_path, limit - 1)
else
res.error!
return false
end
}
}
rescue
puts "Failed to reach #{url}, #{$!.to_s}"
end
def download(url, dest_path, upload_date)
return false if File.exists?(dest_path)
FileUtils.mkdir_p(File.dirname(dest_path)) if not Dir.exists?(File.dirname(dest_path))
if download_file(url,dest_path)
File.utime(upload_date,upload_date,dest_path)
end
return true
end
def make_relative(path)
here = File.expand_path('.', ::File.dirname(__FILE__))
return (path.start_with?(here)) ? path[here.length..-1] : path
end
def strip_domain(url)
uri = URI.parse(url)
return uri.path + ((uri.query.nil? || uri.query.empty?) ? "" : "?#{uri.query}")
end
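# e.g. strip_domain("http://example.com/foo/bar?p=1") #=> "/foo/bar?p=1"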
def get_domain(url)
uri = URI.parse(url)
port = uri.port.nil? ? "" : ":#{uri.port}"
return "http://#{uri.host}#{port}"
end
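# e.g. get_domain("http://example.com/foo") #=> "http://example.com:80" (URI.parse supplies the default port)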
def get_alternate_urls(item, metadata, offline=false)
#Build an array of the 4 data sources: wp:post_id, link, guid, and metadata 'url'
post_id = item.at('wp:post_id').inner_text
urls = [item.at('link').inner_text,
item.at('guid').inner_text,
get_domain(item.at('link').inner_text) + "/?p=#{post_id}"]
meta_url = metadata["url"]
urls.push('/' + meta_url.gsub(/^\//,"")) if not meta_url.nil?
#Cleanse array
urls = urls.uniq.reject{|i| i.nil? || i.empty?}
#Use HTTP requests to capture any redirections.
short_urls = []
urls.each{|v|
if offline
short_urls.push(strip_domain(v))
else
get_redirections(v).each{|url|
short_urls.push(strip_domain(url))
}
end
}
#strip domains, duplicates, and remove empty values
return short_urls.uniq.reject{|i| i.nil? || i.empty?}
end
def self.process(offline=false, filename = "wordpress.xml",drop_metadata=["Author","wp_date","wp_template"])
WordpressImport.new.import(offline, filename,drop_metadata)
end
def import(offline=false, filename = "wordpress.xml", drop_metadata=["Author","wp_date","wp_template"])
import_count = Hash.new(0)
doc = Hpricot::XML(File.read(filename))
## Where to store the attachments. We'll need a URL rewriting rule to change '/wp-content' -> '/attachments/wp-content'
attachment_dir = File.expand_path('content/attachments', ::File.dirname(__FILE__))
# Where to store posts and pages
content_dir = File.expand_path('content/pages', ::File.dirname(__FILE__))
#A hash to detect duplicate URLs in the XML
items_by_urls = {}
authors = {}
#Build hash of login->display name for authors
(doc/:channel).first.get_elements_by_tag_name('wp:author').each do |author|
author_login = author.at('wp:author_login').inner_text.strip
author_name = author.at('wp:author_display_name').inner_text.strip
authors[author_login] = author_name
puts "Author #{author_login} will be mapped to #{author_name}"
end
(doc/:channel/:item).each do |item|
#Get the item title
title = item.at(:title).inner_text.strip
#Get post_id
post_id = item.at('wp:post_id').inner_text
puts "Importing #{post_id} - #{title}"
#Item type: post, page, or attachment
type = item.at('wp:post_type').inner_text
#GMT posted date - always available for attachments, but not always for posts/pages that have never been published.
#Fall back to post_date when we get an ArgumentError - will be off by an unknown timezone, but date should be correct.
post_date_gmt = Time.parse(item.at('wp:post_date_gmt').inner_text) rescue Time.parse(item.at('wp:post_date').inner_text)
#Download attachments unless they already exist
if type == "attachment"
a_url = item.at('wp:attachment_url').inner_text
if offline
puts "(Offline) Skipping #{a_url}"
else
download(a_url, attachment_dir + strip_domain(a_url), post_date_gmt)
end
else
#Parse metadata into a hash
metas = Hash[item.search("wp:postmeta").map{ |meta| [meta.at('wp:meta_key').inner_text, meta.at('wp:meta_value').inner_text]}]
#The template used by wordpress. Can be used to allow regex migrations to Nesta templates
wp_template = metas["_wp_page_template"]
#Discard all meta keys starting with '_', and all empty meta values
metas.reject!{|k,v|k[0] == "_" || v.empty?}
#Parse tags and categories
tags = (item/:category).reject{|c| c.attributes['domain'] != 'post_tag'}.map{|c| c.attributes['nicename']}.uniq
categories = (item/:category).reject{|c| c.attributes['domain'] != 'category'}.map{|c| c.attributes['nicename']}.reject{|c| c == 'uncategorized'}.uniq
#Calculate the status of the page or post: publish, draft, pending, private, or protected
status = item.at('wp:status').inner_text
status = "protected" if not item.at('wp:post_password').inner_text.empty?
is_public = status == "publish"
#Get the slug, fallback to normalized title, then fallback to post ID.
post_name = item.at('wp:post_name').inner_text
post_name = title.downcase.split.join('-') if post_name.empty?
post_name = post_id if post_name.empty?
#Sanitize
post_name = post_name.gsub(/[-]?[^A-Za-z0-9-]+[-]?/,"-").gsub(/^[-]+/,"").gsub(/[-]+$/,"")
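#e.g. "hello_world! (draft)" becomes "hello-world-draft"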
puts "\n\n\n#{post_name}\n\n\n" if post_name.include?("/")
#Calculate the location for the .mdown file
link_uri = URI.parse(item.at('link').inner_text)
old_path_query = strip_domain(item.at('link').inner_text)
#The path (no domain, no port, no querystring) of the item's active URL. No trailing or leading slashes
new_path = link_uri.path.gsub(/^\//,"").gsub(/\/$/,"")
if type == "page"
#Un-named pages go in the drafts folder, regardless of their status.
#Named, but status=draft, private, protected, pending posts simply get flagged, not renamed
if new_path.empty?
new_path = "drafts/#{post_name}"
elsif !is_public
puts "Page #{new_path} has a status of #{status}. Please review file."
end
elsif type == "post"
#Only public articles go into /blog/
if is_public
new_path = "blog/#{post_date_gmt.year}/#{post_name}"
puts "Article #{old_path_query} was placed at #{new_path}"
else
new_path = "drafts/#{post_name}"
puts "(#{status} article #{old_path_query} was placed at #{new_path}"
end
end
short_new_path = new_path
#Add dir and extension
new_path = "#{content_dir}/#{new_path}.mdown"
#Acquire a list of all the URLs that may have been used to link to this post or page, so we can add redirects later
#Exclude any duplicates with previous files - first come, first serve.
alternate_urls = get_alternate_urls(item,metas, offline).reject{|u|
if items_by_urls.has_key?(u)
puts "Duplicate URL '#{u}' used by more than one item - will not be added"
puts "Current: #{short_new_path}, First item: #{items_by_urls[u]}"
puts ""
true
else
items_by_urls[u] = short_new_path
false
end
}
#Convert post_id to an int if it's all digits (so the YAML value isn't quoted)
post_id = post_id.to_i if post_id.match(/^\d+$/)
#Generate metadata table for new file
metadata = {
"Alternate Urls" => alternate_urls * " ",
"Atom ID" => item.at('guid').inner_text.strip,
"Post ID" => post_id,
"Author" => authors[item.at('dc:creator').inner_text.strip],
"wp_date" => post_date_gmt.to_s,
"wp_template" => wp_template,
"Summary" => item.at('excerpt:encoded').inner_text.strip,
"Status" => status == "publish" ? "" : status,
"Categories" => categories * ", ",
"Tags" => tags * ", "
}
#Don't add any values that are empty
metadata.reject!{|k,v| v.nil? or v.to_s.empty?}
#Articles/posts (not pages) get a 'Date' value - this is what Nesta uses to differentiate them
metadata["Date"] = post_date_gmt.strftime("%b %e %Y").gsub(" "," ") if type == "post"
#Make sure metadata uses string keys
#metadata = metadata.inject({}){|memo,(k,v)| memo[k.to_s] = v; memo}
#drop the excluded metadata
metadata.reject!{|k,v| drop_metadata.include?(k)}
#Create file
FileUtils.mkdir_p File.dirname(new_path) if not Dir.exists?(File.dirname(new_path))
File.open(new_path, "w") do |f|
f.puts metadata.to_yaml.gsub(/^---\s*/,"") #Strip leading dashes
f.puts "\n##{title}\n\n"
f.puts item.at('content:encoded').inner_text
f.puts "\n"
metas.each { |key, value|
f.puts "<!--#{key.gsub(/-/,"&#45;")}: #{value.gsub(/-/,"&#45;")}-->\n"
}
end
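#A generated file begins roughly like this (hypothetical post "Hello World", ID 42):
#  Alternate Urls: /2011/08/hello-world /?p=42
#  Post ID: 42
#  Date: Aug 25 2011
#
#  #Hello World
#
#  (body, then user-defined fields as XML comments)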
end
import_count[type] += 1
end
import_count.each do |key, value|
puts "Imported #{value} #{key}s"
end
end
end
end