Skip to content

Instantly share code, notes, and snippets.

@matiaskorhonen
Created September 16, 2016 11:03
Show Gist options
  • Save matiaskorhonen/f0a7685d9870cf2a3548325c602ff237 to your computer and use it in GitHub Desktop.
Save matiaskorhonen/f0a7685d9870cf2a3548325c602ff237 to your computer and use it in GitHub Desktop.
Tumblr blog post extraction code. Pretty much tailored for my needs when I migrated from Tumblr to Middleman, but this might still be of use to someone…
#!/usr/bin/env ruby
require "tumblr_client"
require "time"
require "yaml"
require "fileutils"
require "open-uri"
require "nokogiri"
require "loofah"
require "reverse_markdown"
require "kramdown"
# --- Library configuration ---------------------------------------------------

# HTML -> Markdown converter settings used when post bodies are converted below.
puts "ReverseMarkdown config"
ReverseMarkdown.config do |markdown|
  # NOTE(review): going by the option name, unknown HTML tags are presumably
  # left in the output as-is -- confirm against the ReverseMarkdown docs.
  markdown.unknown_tags = :pass_through
  markdown.github_flavored = true
end

# Tumblr API credentials. The "..." values are placeholders and must be
# replaced with real keys/tokens before the script can talk to the API.
puts "Tumblr config"
Tumblr.configure do |tumblr|
  tumblr.consumer_key = "..."
  tumblr.consumer_secret = "..."
  tumblr.oauth_token = "..."
  tumblr.oauth_token_secret = "..."
end
# --- Fetch every post of the blog ---------------------------------------------
#
# Produces `posts`, an array holding the post hashes of every page, which the
# processing section below iterates over.

client = Tumblr::Client.new
# The response is stored but nothing in this section reads it.
user_info = client.info
per_page = 20
blog = "matiaskorhonen.tumblr.com"

# Fetches one page of posts starting at `offset` and returns the response hash.
# Raises RuntimeError when the response carries no "posts" key, so a failed
# request is reported here instead of surfacing later as a NoMethodError on nil.
# NOTE(review): tumblr_client appears to return an error hash ("status"/"msg")
# rather than raising on API failures -- confirm against the gem.
fetch_page = lambda do |offset|
  response = client.posts(blog, offset: offset, limit: per_page)
  unless response.is_a?(Hash) && response["posts"]
    raise "Tumblr API request failed at offset #{offset}: #{response.inspect}"
  end
  response
end

puts "Fetching posts"
data = fetch_page.call(0)
total_posts = data["total_posts"]
puts "Total posts: #{total_posts}"

# The first page is already in hand; request pages 1..(pages - 1).
pages = (total_posts.to_f / per_page).ceil
posts = data["posts"]
(1...pages).each do |page|
  posts.concat(fetch_page.call(page * per_page)["posts"])
end
# --- Convert every fetched post into a Markdown file with YAML front matter ---

# Derives the local file name for a remote image: the last URL path segment cut
# at its first dot, followed by that segment's final extension
# (".../tumblr_abc.v2.jpg" becomes "tumblr_abc.jpg").
def image_file_name(url)
  segment = url.split("/").last
  "#{File.basename(segment).split(".").first}#{File.extname(segment)}"
end

puts "Processing posts"

# Host of the exported blog; stripped from post URLs to obtain relative paths.
blog_host = "matiaskorhonen.tumblr.com"
posts_count = posts.size

posts.each_with_index do |post, index|
  slug = post["slug"].to_s
  puts "=> #{index + 1}/#{posts_count}: #{slug}"

  date = Time.parse(post["date"])
  # Posts without a slug fall back to their id so that several slug-less posts
  # from the same day do not overwrite each other's output file.
  name_part = slug.empty? ? post["id"].to_s : slug
  basename = "#{date.strftime("%Y-%m-%d")}_#{name_part}"
  filename = "#{basename}.html.md"
  body = ""
  type = post["type"]

  # Relative path of the post, accepting both http and https post URLs.
  post_path = post["post_url"].sub(%r{\Ahttps?://#{Regexp.escape(blog_host)}/}, "") + "/"
  # Same path without the trailing "slug/" part. Only a trailing match is
  # removed, and an empty slug leaves the path untouched (removing "" + "/"
  # would otherwise strip every slash from the path).
  short_path = slug.empty? ? post_path : post_path.sub(%r{#{Regexp.escape(slug)}/\z}, "")
  aliases = [post_path, short_path].uniq
  aliases << short_path.sub("post", "image") if type == "photo"

  frontmatter = {
    "tumblr_id" => post["id"],
    "alias" => aliases,
    "date" => date,
    "tags" => post["tags"],
    "type" => type
  }

  # Pick the HTML body and the type-specific front matter fields.
  case type
  when "text"
    frontmatter["title"] = post["title"]
    body = post["body"]
  when "photo"
    frontmatter["source_url"] = post["source_url"] if post["source_url"]
    frontmatter["source_title"] = post["source_title"] if post["source_title"]
    begin
      puts " -> Fetching image"
      photo_url = post["photos"].first["original_size"]["url"]
      puts " #{photo_url}"
      photo_file = image_file_name(photo_url)
      # The photo itself is not downloaded; only its file name is recorded.
      # The disabled download step is kept for reference:
      #
      # photo_path = "#{basename}/#{photo_file}"
      # unless File.exist? photo_path
      #   FileUtils.mkdir_p(basename)
      #   URI.open(photo_url) do |photo|
      #     File.open(photo_path, "wb") { |f| f.write photo.read }
      #   end
      # end
      frontmatter["photo"] = photo_file
      body = "#{post["caption"]}\n"
    rescue
      # Show the offending photo data, then let the error propagate.
      puts post["photos"].inspect
      raise
    end
  when "quote"
    body = post["text"]
    frontmatter["source"] = post["source"]
  when "video"
    body = post["caption"]
    frontmatter["video_url"] = post["permalink_url"]
  when "link"
    frontmatter["title"] = post["title"]
    frontmatter["link_url"] = post["url"]
    body = post["description"]
  else
    raise "Unknown type: #{type}"
  end

  # Downloads every <img> found in the body into a directory named after the
  # post and rewrites the tag's src to point at the local copy.
  image_scrubber = Loofah::Scrubber.new do |node|
    next unless node.name == "img"

    photo_url = node["src"]
    # An <img> without a src has nothing to download; leave it untouched.
    next if photo_url.to_s.empty?

    puts " -> Fetching image"
    puts " #{photo_url}"
    photo_path = "#{basename}/#{image_file_name(photo_url)}"
    # URI.open replaces the bare open(url) call, which no longer handles URLs
    # on Ruby 3.0+; the block form also closes the remote handle.
    URI.open(photo_url) do |remote|
      FileUtils.mkdir_p(basename)
      File.open(photo_path, "wb") { |f| f.write(remote.read) }
    end
    node["src"] = "articles/#{photo_path}"
  end

  doc = Loofah.fragment(body.to_s).scrub!(image_scrubber)
  # Undo the escaping of underscores in the converted Markdown.
  body = ReverseMarkdown.convert(doc.to_s).gsub('\_', "_")

  # Posts without a title get one derived from the first line of the body,
  # rendered to HTML and reduced to plain text.
  if frontmatter["title"].nil?
    # to_s: an empty body has no first line.
    title_md = body.split("\n").first.to_s
    title_html = Kramdown::Document.new(title_md).to_html
    title_text = Nokogiri::HTML::DocumentFragment.parse(title_html).text
    frontmatter["title"] = title_text.gsub("\n", " ").strip
    puts " -> Generated title: “#{frontmatter["title"]}”"
  end

  # Front matter first, closed by a "---" line, then the Markdown body.
  File.open(filename, "w") do |f|
    f.write YAML.dump(frontmatter)
    f.write "---\n\n"
    f.write body
    f.write "\n"
  end
end

puts "Done"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment