rposborne · June 2, 2015 17:24
diff --git a/fix_export.rb b/fix_export.rb
 #!/usr/bin/env ruby
 require 'nokogiri'
 require 'pry'
 require 'time'

 # Get all Images that have already been imported to wordpress + thumbs

 # Root folder that holds the export file and the local copy of the images.
 WORKING_DIR = '/Volumes/datastore_documents/Rlens Blog/'.freeze
 # Folder scanned for images and thumbnails; its prefix is swapped for the
 # WordPress uploads URL by update_path.
 IMAGES_DIR = (WORKING_DIR + 'wp_images/').freeze

 # Top-level locals used by the rest of the script. Method bodies cannot see
 # top-level locals, so the helpers below read the constants instead.
 working_dir = WORKING_DIR
 xml_file    = 'wordpress_export_1 2.xml'

 # Looks up the local image that corresponds to +filename+ and returns its
 # WordPress uploads path.
 #
 # Candidates are all files under IMAGES_DIR whose path contains the
 # extension-less base name of +filename+. The first candidate whose path
 # contains +size+ wins; otherwise the first candidate whose path contains
 # +filename+ itself (the full-resolution file) is used.
 #
 # @param filename [String] image file name to look up
 # @param size [String] text that identifies the wanted thumbnail size
 # @return [String, nil] the rewritten path, or nil (after printing a
 #   message) when nothing matches
 def get_new_path(filename, size: '1024')
   # The directory listing is memoized so the tree is only walked once.
   @images ||= Dir.glob(IMAGES_DIR + '**/*')

   stem = File.basename(filename.to_s, '.*')
   # An empty stem would match every path, so treat it as "no candidates".
   candidates = stem.empty? ? [] : @images.select { |file| file.include?(stem) }

   match = candidates.find { |file| file.include?(size) } ||
           candidates.find { |file| file.include?(filename) }
   return update_path(match) if match

   puts "#{filename} could not be found"
   nil
 end

 # Replaces the local images folder prefix with the WordPress uploads path.
 #
 # @param path [String] path of a file under IMAGES_DIR
 # @return [String] the path with the prefix replaced
 def update_path(path)
   path.gsub(IMAGES_DIR, '/wp-content/uploads/')
 end

 # Parses +time_string+ and renders it as "YYYY-MM-DDTHH:MM:SS" followed by
 # the numeric UTC offset (for example "+0000").
 #
 # @param time_string [String] any timestamp Time.parse understands
 # @return [String] the re-formatted timestamp
 def xml_time(time_string)
   parsed = Time.parse(time_string)
   parsed.strftime('%Y-%m-%dT%H:%M:%S%z')
 end

 # Parse the export; the block form closes the file even if parsing raises.
 doc = File.open(working_dir + xml_file) { |f| Nokogiri::XML(f) }

 # Re-format every <pubDate> in the export.
 doc.css('pubDate').each do |date_el|
   date_el.content = xml_time(date_el.text)
 end

 # Tag => [attribute that holds the URL, thumbnail size to look up].
 # Links point at the 1024 rendition, inline images at the 510 one.
 rewrites = { 'a' => %w[href 1024], 'img' => %w[src 510] }

 doc.css('item').each do |item|
   # Correct images and links
   encoded = item.at_xpath('content:encoded')
   if encoded
     # Parse as a fragment so serializing it back does not wrap the post body
     # in a doctype/html/body envelope.
     fragment = Nokogiri::HTML.fragment(encoded.content)

     rewrites.each do |tag, (attr, size)|
       fragment.css(tag).each do |el|
         url = el[attr]
         next unless url

         basename = File.basename(url)
         # Only names containing ".scaled" are rewritten; anything else
         # (external links, other assets) is left untouched.
         next unless basename.include?('.scaled')

         new_path = get_new_path(basename.rpartition('.scaled')[0], size: size)
         # Keep the original URL when no local image was found.
         el[attr] = new_path if new_path
       end
     end

     encoded.content = fragment.to_html
   end

   # Correct dates
   %w[wp:post_date wp:post_date_gmt].each do |xpath|
     node = item.at_xpath(xpath)
     node.content = xml_time(node.text) if node
   end
 end

 File.open('output.xml', 'w') { |o| o.print(doc.to_xml) }
diff --git a/Repair Posterous b/Repair Posterous
 A quick script that corrects timestamps and updates images and image links in the WordPress export data produced by Posterous.

 Figured it might be useful for someone.
	#!/usr/bin/env ruby
	require 'nokogiri'
	require 'pry'
	require 'time'

	# Get all Images that have already been imported to wordpress + thumbs

	# Root folder that holds the export file and the local copy of the images.
	WORKING_DIR = '/Volumes/datastore_documents/Rlens Blog/'.freeze
	# Folder scanned for images and thumbnails; its prefix is swapped for the
	# WordPress uploads URL by update_path.
	IMAGES_DIR = (WORKING_DIR + 'wp_images/').freeze

	# Top-level locals used by the rest of the script. Method bodies cannot see
	# top-level locals, so the helpers below read the constants instead.
	working_dir = WORKING_DIR
	xml_file = 'wordpress_export_1 2.xml'

	# Looks up the local image that corresponds to +filename+ and returns its
	# WordPress uploads path.
	#
	# Candidates are all files under IMAGES_DIR whose path contains the
	# extension-less base name of +filename+. The first candidate whose path
	# contains +size+ wins; otherwise the first candidate whose path contains
	# +filename+ itself (the full-resolution file) is used.
	#
	# @param filename [String] image file name to look up
	# @param size [String] text that identifies the wanted thumbnail size
	# @return [String, nil] the rewritten path, or nil (after printing a
	#   message) when nothing matches
	def get_new_path(filename, size: '1024')
	  # The directory listing is memoized so the tree is only walked once.
	  @images ||= Dir.glob(IMAGES_DIR + '**/*')

	  stem = File.basename(filename.to_s, '.*')
	  # An empty stem would match every path, so treat it as "no candidates".
	  candidates = stem.empty? ? [] : @images.select { |file| file.include?(stem) }

	  match = candidates.find { |file| file.include?(size) } ||
	          candidates.find { |file| file.include?(filename) }
	  return update_path(match) if match

	  puts "#{filename} could not be found"
	  nil
	end

	# Replaces the local images folder prefix with the WordPress uploads path.
	#
	# @param path [String] path of a file under IMAGES_DIR
	# @return [String] the path with the prefix replaced
	def update_path(path)
	  path.gsub(IMAGES_DIR, '/wp-content/uploads/')
	end

	# Parses +time_string+ and renders it as "YYYY-MM-DDTHH:MM:SS" followed by
	# the numeric UTC offset (for example "+0000").
	#
	# @param time_string [String] any timestamp Time.parse understands
	# @return [String] the re-formatted timestamp
	def xml_time(time_string)
	  parsed = Time.parse(time_string)
	  parsed.strftime('%Y-%m-%dT%H:%M:%S%z')
	end

	# Parse the export; the block form closes the file even if parsing raises.
	doc = File.open(working_dir + xml_file) { |f| Nokogiri::XML(f) }

	# Re-format every <pubDate> in the export.
	doc.css('pubDate').each do |date_el|
	  date_el.content = xml_time(date_el.text)
	end

	# Tag => [attribute that holds the URL, thumbnail size to look up].
	# Links point at the 1024 rendition, inline images at the 510 one.
	rewrites = { 'a' => %w[href 1024], 'img' => %w[src 510] }

	doc.css('item').each do |item|
	  # Correct images and links
	  encoded = item.at_xpath('content:encoded')
	  if encoded
	    # Parse as a fragment so serializing it back does not wrap the post body
	    # in a doctype/html/body envelope.
	    fragment = Nokogiri::HTML.fragment(encoded.content)

	    rewrites.each do |tag, (attr, size)|
	      fragment.css(tag).each do |el|
	        url = el[attr]
	        next unless url

	        basename = File.basename(url)
	        # Only names containing ".scaled" are rewritten; anything else
	        # (external links, other assets) is left untouched.
	        next unless basename.include?('.scaled')

	        new_path = get_new_path(basename.rpartition('.scaled')[0], size: size)
	        # Keep the original URL when no local image was found.
	        el[attr] = new_path if new_path
	      end
	    end

	    encoded.content = fragment.to_html
	  end

	  # Correct dates
	  %w[wp:post_date wp:post_date_gmt].each do |xpath|
	    node = item.at_xpath(xpath)
	    node.content = xml_time(node.text) if node
	  end
	end

	File.open('output.xml', 'w') { |o| o.print(doc.to_xml) }
	A quick script that corrects timestamps and updates images and image links in the WordPress export data produced by Posterous.

	Figured it might be useful for someone.