|
#!/usr/bin/ruby |
|
|
|
require 'Nokogiri' |
|
require 'net/http' |
|
require 'securerandom' |
|
require 'FileUtils' |
|
require 'date' |
|
|
|
# One blog post taken from an <item> node of the XML export. It can render
# itself as a Markdown file under blog/ and mirrors any images hosted on the
# Squarespace CDN into img/import<link>/.
class Post
  attr_accessor :title, :raw_content, :pub_date, :link

  # @param node [#xpath] the post's <item> node; only xpath(...).text is used,
  #   for the "title", "content:encoded", "pubDate" and "link" children.
  def initialize(node)
    @title = node.xpath("title").text
    @raw_content = node.xpath("content:encoded").text
    @pub_date = node.xpath("pubDate").text
    @link = node.xpath("link").text
  end

  # Rewrites the post's HTML (local images, responsive iframes, Instagram
  # captions removed) and writes it to blog/<date>-<slug>.markdown.
  #
  # Side effects: downloads images over HTTP, creates directories, writes
  # files, and prints progress to stdout.
  #
  # @return [nil]
  # @raise [Date::Error, ArgumentError] if pub_date cannot be parsed
  def generate_markdown
    # Wrapping in a synthetic <import> element lets us pull the fragment back
    # out of the full HTML document Nokogiri builds around it.
    content_node = Nokogiri::HTML("<import>#{@raw_content}</import>")

    localize_images(content_node)
    wrap_iframes(content_node)
    strip_instagram_captions(content_node)

    # DateTime (not Date) so the time of day survives into the front matter.
    published_at = DateTime.parse(@pub_date)

    directory = "blog"
    FileUtils.mkdir_p(directory)
    filename = "#{directory}/#{markdown_basename(published_at)}"

    IO.write(filename, markdown_body(published_at, content_node.xpath("//import").first))

    puts "Wrote #{filename}"
  end

  def to_s
    "#{@title} published on #{pub_date}"
  end

  private

  # Downloads every Squarespace-hosted image and repoints its <img> at the local copy.
  def localize_images(content_node)
    images = content_node.xpath("//img").select do |image|
      src = image["src"]
      src && src.include?("squarespace.com")
    end

    images.each do |image|
      source = image["src"]
      response = Net::HTTP.get_response(URI.parse(source))

      # An error or redirect body is not image data; keep the original src
      # rather than saving it under an image extension.
      unless response.is_a?(Net::HTTPSuccess)
        puts "Warning – could not download #{source} (HTTP #{response.code}), leaving it unchanged"
        next
      end

      extension = extension_for(response["Content-Type"], source)
      directory = "img/import#{link}/"
      filename = "#{directory}#{SecureRandom.uuid.delete('-').upcase}.#{extension}"

      FileUtils.mkdir_p(directory)
      # Binary write: image bytes must not go through newline/encoding translation.
      File.binwrite(filename, response.body)
      puts "Wrote #{filename} to disk"

      image["src"] = "/#{filename}"
      image["class"] = "img-responsive"
    end
  end

  # Maps a Content-Type header value to a file extension; warns and falls
  # back to "jpeg" when the type is missing or unrecognized.
  def extension_for(content_type, source)
    mime = content_type.to_s # the header may be absent (nil)

    if mime.include?("image/jpg") || mime.include?("image/jpeg")
      "jpg"
    elsif mime.include?("image/gif")
      "gif"
    elsif mime.include?("image/png")
      "png"
    else
      puts "Warning – unknown file (defaulting to jpeg): #{source}"
      "jpeg"
    end
  end

  # Wraps each iframe in a 16:9 responsive container and tags it as the embedded item.
  def wrap_iframes(content_node)
    iframes = content_node.xpath("//iframe")
    iframes.wrap("<div class='embed-responsive embed-responsive-16by9'></div>")
    iframes.each { |iframe| iframe["class"] = "embed-responsive-item" }
  end

  # Removes the <p> children of Instagram oEmbed containers.
  def strip_instagram_captions(content_node)
    content_node.xpath("//div[contains(@class,\"instagram-oembed\")]/p").each(&:remove)
  end

  # Builds "<YYYY-MM-DD>-<slug>.markdown"; the slug is the title with spaces
  # turned into dashes, the characters ' " ? / : dropped, and lowercased.
  def markdown_basename(published_at)
    slug = @title.gsub(" ", "-").gsub(/['"?\/:]/, "").downcase
    "#{published_at.strftime('%Y-%m-%d')}-#{slug}.markdown"
  end

  # Builds the file contents: front matter, the rewritten HTML, and the
  # "<!-- more -->" marker.
  def markdown_body(published_at, content)
    # Escape backslashes and double quotes so the quoted title stays valid
    # in the front matter.
    escaped_title = @title.gsub(/["\\]/) { |char| "\\#{char}" }

    [
      "---",
      "title: \"#{escaped_title}\"",
      "date: #{published_at.strftime('%Y-%m-%d %H:%M')}",
      "---",
      "",
      content.to_s,
      "",
      "<!-- more -->",
      "",
      ""
    ].join("\n")
  end
end
|
|
|
# Entry point: read the XML export named by the first command-line argument
# and write one Markdown file per published post.
filename = ARGV.first
abort "Usage: ./script path_of_xml_file" unless filename
abort "File does not exist" unless File.exist?(filename)

# Block form guarantees the file handle is closed even if parsing raises.
doc = File.open(filename) { |file| Nokogiri::XML(file) }

puts "Opened XML file at #{filename}"

# Keep only items that are posts and have been published.
post_nodes = doc.xpath("//item").select do |item|
  item.xpath("wp:post_type/text()").text == "post" &&
    item.xpath("wp:status/text()").text == "publish"
end

posts = post_nodes.map { |node| Post.new(node) }

# Debug aid: convert only the first post.
# puts posts[0].generate_markdown
posts.each(&:generate_markdown)