Created
June 2, 2015 17:24
-
-
Save rposborne/77bab389321d3b9fe490 to your computer and use it in GitHub Desktop.
Repair Posterous Wordpress Export
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'nokogiri' | |
require 'pry' | |
require 'time' | |
# Locations of the Posterous export and of the images (plus thumbnails) that
# have already been imported into WordPress.
#
# These are constants rather than plain locals because top-level local
# variables are not visible inside `def` bodies; the helpers below need them.
WORKING_DIR = '/Volumes/datastore_documents/Rlens Blog/'.freeze
WP_IMAGES_DIR = "#{WORKING_DIR}wp_images/".freeze

# Local names kept for the top-level script code further down the file.
working_dir = WORKING_DIR
xml_file = 'wordpress_export_1 2.xml'

# Finds the WordPress upload path for an image referenced by the export.
#
# @param filename [String] original image file name, e.g. "photo.jpg"
# @param size [String] text that must appear in the path of the wanted
#   thumbnail, e.g. "1024"
# @return [String, nil] the path rewritten under /wp-content/uploads/, or nil
#   (after printing a notice) when no imported image matches
def get_new_path(filename, size: '1024')
  # The directory listing is memoized so the glob runs only once per process.
  @images ||= Dir.glob("#{WP_IMAGES_DIR}**/*")

  stem = File.basename(filename.to_s, '.*')
  # An empty stem is a substring of every path and would match an arbitrary image.
  if stem.empty?
    puts "#{filename} could not be found"
    return
  end

  images_and_thumbs = @images.select { |file| file.include?(stem) }

  # Prefer the requested size; fall back to the full-resolution file, whose
  # path contains the complete original file name.
  # NOTE(review): both checks are plain substring matches on the whole path,
  # so a stem that itself contains the size text can match the wrong file.
  system_path = images_and_thumbs.find { |f| f.include?(size) } ||
                images_and_thumbs.find { |f| f.include?(filename) }
  return update_path(system_path) if system_path

  puts "#{filename} could not be found"
end

# Converts a local path inside the imported-images folder into the matching
# site-relative WordPress uploads path.
#
# @param path [String] path produced by the glob in get_new_path
# @return [String] the path with the local folder prefix replaced
def update_path(path)
  path.gsub(WP_IMAGES_DIR, '/wp-content/uploads/')
end
# Re-renders a parseable timestamp as `YYYY-MM-DDTHH:MM:SS+hhmm`.
#
# @param time_string [String] any timestamp text accepted by Time.parse
# @return [String] the same instant formatted with a numeric UTC offset
def xml_time(time_string)
  parsed = Time.parse(time_string)
  parsed.strftime('%Y-%m-%dT%H:%M:%S%z')
end
# Points one URL attribute of every matching element at the imported
# WordPress copy of the image it references.
#
# @param fragment [Nokogiri::XML::Node] parsed post body
# @param selector [String] CSS selector of the elements to rewrite
# @param attr_name [String] attribute holding the URL ("href" or "src")
# @param size [String] thumbnail size passed on to get_new_path
def rewrite_image_urls(fragment, selector, attr_name, size)
  fragment.css(selector).each do |el|
    url = el[attr_name]
    next unless url

    base = File.basename(url)
    # Without the ".scaled" marker rpartition yields an empty name, which
    # would match an arbitrary image; leave such URLs untouched.
    next unless base.include?('.scaled')

    filename_to_lookup = base.rpartition('.scaled')[0]
    new_path = get_new_path(filename_to_lookup, size: size)
    # Keep the original URL when no imported image was found instead of
    # blanking the attribute.
    el[attr_name] = new_path if new_path
  end
end

# Block form closes the export file even if parsing raises.
doc = File.open(working_dir + xml_file) { |f| Nokogiri::XML(f) }

doc.css('pubDate').each do |date_el|
  date_el.content = xml_time(date_el.text)
end

doc.css('item').each do |item|
  # Correct images and links
  encoded = item.at_xpath('content:encoded')
  if encoded
    # Parse as a fragment so serializing does not wrap the post body in a
    # DOCTYPE/html/body skeleton.
    content = Nokogiri::HTML.fragment(encoded.content)
    # Links should point at the 1024 rendition
    rewrite_image_urls(content, 'a', 'href', '1024')
    # Inline images should use the 510 rendition
    rewrite_image_urls(content, 'img', 'src', '510')
    encoded.content = content.to_html
  end

  # Correct dates
  %w[wp:post_date wp:post_date_gmt].each do |xpath|
    node = item.at_xpath(xpath)
    node.content = xml_time(node.text) if node
  end
end

File.open('output.xml', 'w') { |o| o.print(doc.to_xml) }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
A quick script that corrects timestamps and updates images and image links in Posterous WordPress export data.
Figured it might be useful for someone.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment