-
-
Save spiffytech/e73777e167dc5a8b6a87 to your computer and use it in GitHub Desktop.
Imports a Squarespace dump into Jekyll
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
require 'rubygems' | |
require 'hpricot' | |
require 'nokogiri' | |
require 'fileutils' | |
require 'safe_yaml' | |
require 'time' | |
require 'uri' | |
require 'open-uri' | |
module JekyllImport
  # This importer takes a wordpress.xml file, which can be exported from your
  # wordpress.com blog (/wp-admin/export.php), and writes every item it
  # contains to "_<post_type>s/<date>-<slug>.html" with a YAML front matter
  # header. Images hosted on Squarespace are downloaded into the image folder
  # and their src attributes are rewritten to point at the local copies.
  module WordpressDotCom
    # Domains whose images are mirrored locally. A host matches when it is one
    # of these domains or a subdomain of one of them.
    SQUARESPACE_HOSTS = %w[squarespace.com squarespace-cdn.com].freeze

    attr :image_folder
    # Folder (relative to the working directory) that receives the images.
    @image_folder = 'squarespace_images'

    # Downloads +src+ and stores it at +dest+.
    #
    # @param src [String] URL (or local path) of the image to fetch
    # @param dest [String] path of the file to create
    # @return [Integer, nil] number of bytes written, or nil when +dest+
    #   already existed and nothing was fetched
    # @raise [StandardError] whatever URI.open raises when the fetch fails
    def self.download_image(src, dest)
      return if ::File.exist? dest # Speed-up for when importing multiple times

      # Fetch first, write second: opening +dest+ before the download would
      # leave an empty file behind on failure, and the existence check above
      # would then skip that image on every later run.
      # URI.open is provided by open-uri; plain Kernel#open no longer accepts
      # URLs on Ruby 3.
      data = URI.open(src, 'rb', &:read)
      ::File.binwrite(dest, data)
    end

    # Imports every item of the export file.
    #
    # @param filename [Hash] options; +:source+ is the path of the export file
    # @return [Hash] number of imported items per post type
    def self.process(filename = { :source => '_wordpress.xml' })
      FileUtils.mkdir_p @image_folder

      import_count = Hash.new(0)
      export = Hpricot::XML(::File.read(filename[:source]))

      (export/:channel/:item).each do |item|
        title = item.at(:title).inner_text.strip

        permalink_title = item.at('wp:post_name').inner_text.gsub('/', '-')
        # Fallback to "prettified" title if post_name is empty (can happen)
        permalink_title = sluggify(title) if permalink_title.empty?

        date = parse_post_date(item.at('wp:post_date'))

        status = item.at('wp:status').inner_text
        published = status == 'publish'
        type = item.at('wp:post_type').inner_text

        categories = item.search('category[@domain="category"]')
                         .map(&:inner_text)
                         .reject { |category| category == 'Uncategorized' }
                         .uniq
        tags = item.search('category[@domain="post_tag"]').map(&:inner_text).uniq

        metas = item.search('wp:postmeta').each_with_object({}) do |meta, collected|
          collected[meta.at('wp:meta_key').inner_text] = meta.at('wp:meta_value').inner_text
        end

        body = item.at('content:encoded').inner_text
        body = body.gsub(/\[\/?caption[^\]]*\]/, '') # Remove caption blocks which don't render properly
        body = localize_images(body)

        name = "#{date.strftime('%Y-%m-%d')}-#{permalink_title}.html"

        header = {
          'layout'     => type,
          'title'      => title,
          'categories' => categories,
          'tags'       => tags,
          'status'     => status,
          'type'       => type,
          'published'  => published,
          'meta'       => metas
        }

        begin
          FileUtils.mkdir_p "_#{type}s"
          ::File.open("_#{type}s/#{name}", 'w') do |f|
            f.puts header.to_yaml
            f.puts '---'
            f.puts body
          end
        rescue => e
          puts "Couldn't import post!"
          puts "Title: #{title}"
          puts "Name/Slug: #{name}\n"
          puts "Error: #{e.message}"
          next
        end

        import_count[type] += 1
      end

      import_count.each do |key, value|
        puts "Imported #{value} #{key}s"
      end
    end

    # Turns a title into a lowercase, hyphen-separated slug. Runs of
    # non-alphanumeric characters collapse into one hyphen; hyphens at either
    # end are dropped so the slug never starts or ends with a separator.
    #
    # @param title [String]
    # @return [String]
    def self.sluggify(title)
      title.gsub(/[^[:alnum:]]+/, '-').gsub(/\A-|-\z/, '').downcase
    end

    # Parses the text of a wp:post_date node, falling back to the current time
    # when the node is missing or its text cannot be parsed (best effort).
    def self.parse_post_date(node)
      return Time.now unless node

      Time.parse(node.inner_text)
    rescue StandardError
      Time.now
    end

    # Identifies Squarespace-hosted images in +html+, downloads them, and
    # updates their URLs to point to our copies. Returns the rewritten markup.
    # NOTE(review): Nokogiri::HTML parses +html+ as a whole document, so the
    # returned string carries the doctype/html/body wrapper around the post.
    def self.localize_images(html)
      page = Nokogiri::HTML(html)

      page.css('img').each do |img|
        src = img['src'].to_s
        uri = parse_uri(src)
        # Relative or unparsable sources have no host and are left untouched.
        next unless uri && squarespace_host?(uri.host)

        image_name = local_image_name(uri)
        next if image_name.empty?

        dest = ::File.join(@image_folder, image_name)
        # Protocol-relative sources ("//host/path") need a scheme to be fetched.
        remote = src.start_with?('//') ? "https:#{src}" : src

        begin
          download_image(remote, dest)
        rescue StandardError => e
          # One unreachable image must not abort the whole import; keep the
          # original URL for it and carry on.
          warn "Couldn't download image #{src}: #{e.message}"
          next
        end

        img['src'] = "/#{dest}"
      end

      page.to_s
    end

    # Returns the parsed URI for +src+, or nil when it is empty or malformed.
    def self.parse_uri(src)
      return nil if src.empty?

      URI(src)
    rescue URI::InvalidURIError
      nil
    end

    # True when +host+ is one of SQUARESPACE_HOSTS or a subdomain of one.
    def self.squarespace_host?(host)
      return false if host.nil?

      SQUARESPACE_HOSTS.any? do |domain|
        host == domain || host.end_with?(".#{domain}")
      end
    end

    # Builds a flat file name from the URI path ("/" becomes "_"). A fragment,
    # when present, is inserted before the extension so the extension is kept.
    def self.local_image_name(uri)
      flat = uri.path.sub(%r{\A/}, '').tr('/', '_')
      fragment = uri.fragment.to_s.tr('/', '_')
      return flat if fragment.empty?

      ext = ::File.extname(flat)
      "#{::File.basename(flat, ext)}_#{fragment}#{ext}"
    end

    private_class_method :parse_post_date, :localize_images, :parse_uri,
                         :squarespace_host?, :local_image_name
  end
end
# Run the import only when this file is executed as a script, so that merely
# requiring it does not start an import. An optional first command-line
# argument names the export file; without it, process uses its own default.
if $PROGRAM_NAME == __FILE__
  if ARGV.empty?
    JekyllImport::WordpressDotCom.process
  else
    JekyllImport::WordpressDotCom.process(:source => ARGV.first)
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# Converts every "<year>-<month>-<day>-<title>.html" file in the current
# directory to Markdown and renames it to the matching ".md" file.
require 'html2markdown'

# Anchored at both ends and with the dot escaped, so only file names that have
# exactly the dated-post shape are picked up.
POST_REGEX = /\A(?<year>[0-9]+)-(?<month>[0-9]+)-(?<day>[0-9]+)-(?<title>.*)\.html\z/

Dir.glob('*.html').each do |post|
  data = post.match(POST_REGEX)
  next unless data

  # Convert before reopening the file for writing: opening with 'w' truncates
  # it, so a conversion error after that point would destroy the source.
  markdown = HTMLPage.new(contents: File.read(post)).markdown

  File.open(post, 'w') { |f| f.puts markdown }
  File.rename(post, "#{data[:year]}-#{data[:month]}-#{data[:day]}-#{data[:title]}.md")
end
My Squarespace image files all ended with .ext_ (.jpeg_, .png_, etc.)
Also, made adjustments mentioned by @lpattori and it worked well.
Imported 106 pages
Imported 487 posts
Imported 70 attachments
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks a lot! I really needed to recover the images.
To make it work I had to replace open with URI.open in line 23 and 'squarespace.com' with 'squarespace-cdn.com' in line 81.