Created
May 27, 2013 10:20
-
-
Save lukeholder/5656358 to your computer and use it in GitHub Desktop.
ruby script to get at a wordpress export and put it into a sqlite db
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/env ruby | |
# encoding: utf-8 | |
require 'rubygems' | |
require 'bundler/setup' | |
# require your gems as usual | |
require 'sequel' | |
require 'nokogiri' | |
require 'time' | |
require 'active_support/core_ext/string' | |
# require 'reverse_markdown' | |
# Turn plain text into minimal HTML: blank-line-separated chunks become
# <p> paragraphs and single newlines become <br> line breaks.
#
# The input is never modified: all substitutions run on a private copy, so
# frozen strings are accepted and the caller's string is left untouched.
#
# @param text [String, nil] raw text; nil is treated as an empty string
# @return [String] a new string wrapped in <p>...</p>
def simple_format(text)
  start_tag = "<p>"
  # dup first: String#to_str returns the receiver itself, and every call
  # below mutates in place.
  html = (text.nil? ? '' : text.to_str).dup
  html.gsub!(/\r\n?/, "\n") # \r\n and \r -> \n
  html.gsub!(/\n\n+/, "</p>\n\n#{start_tag}") # 2+ newlines -> paragraph break
  # A lone newline becomes <br>, unless it sits right after a tag's closing
  # ">" or right before a newline or an opening "<".
  html.gsub!(/([^>])(\n)([^\n<])/, '\1<br>\2\3')
  html.insert(0, start_tag)
  html.concat("</p>")
end
# Open (or create) the SQLite database stored in the file recipes.db.
DB = Sequel.sqlite('recipes.db')
# Rebuild the table from scratch on every run. drop_table? only drops the
# table when it exists, so a first run against a fresh database file does
# not fail.
DB.drop_table?(:recipes)
puts "Creating recipes table"
DB.create_table :recipes do
  primary_key :id
  String :title
  # Integer replaces the Fixnum column type, which no longer exists in
  # current Ruby or Sequel.
  Integer :title_length
  String :tags, :text => true
  String :raw_body, :text => true
  String :status
  DateTime :date_time
  String :old_url_a
  String :old_url_b
  String :slug
end
# Dataset used to insert one row per exported WordPress item.
recipes = DB[:recipes]

# Parse the export inside the block form of File.open so the file handle is
# closed as soon as the document has been read.
items = File.open("tenina.wordpress.2013-05-27.xml") do |f|
  Nokogiri::XML(f).xpath("//channel//item")
end

items.each do |item|
  post_id = item.at_xpath('wp:post_id').text.to_i
  title = item.at_xpath('title').text.to_s.titleize

  categories = (item/"category[@domain=category]")
               .map { |c| c.inner_text.singularize.titleize }
               .reject { |c| c == 'Uncategorized' }
  tags = (item/"category[@domain=post_tag]")
         .map { |t| t.inner_text.singularize.titleize }
  # Merge tags and categories into one flat list before de-duplicating.
  # Pushing the categories array with << would nest it as a single element,
  # so uniq would not remove a name that appears in both lists.
  tags = (tags + categories).uniq.join(',')

  status = item.at_xpath('wp:status').text.singularize.titleize
  body = simple_format(item.at_xpath("content:encoded").text)
  date = item.at_xpath('wp:post_date').text
  old_url_a = item.at_xpath('link').text.to_s
  old_url_b = item.at_xpath('guid').text.to_s

  # NOTE(review): the slug column previously had no value supplied. It is
  # now taken from wp:post_name (presumably the WordPress permalink slug --
  # confirm against the export), falling back to a parameterized title when
  # that element is missing or empty.
  post_name = item.at_xpath('wp:post_name')
  slug = post_name ? post_name.text.to_s : ''
  slug = title.parameterize if slug.empty?

  recipes.insert(
    :id => post_id,
    :title => title,
    :status => status,
    :raw_body => body,
    :title_length => title.length,
    :tags => tags,
    :date_time => date,
    :old_url_a => old_url_a,
    :old_url_b => old_url_b,
    :slug => slug
  )
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment