Tumblr blog post extraction code. Pretty much tailored for my needs when I migrated from Tumblr to Middleman, but this might still be of use to someone…
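The gems it needs (inferred from the requires in the script; time, yaml, fileutils, and open-uri ship with Ruby) could go in a minimal Gemfile along these lines:

source "https://rubygems.org"

gem "tumblr_client"
gem "nokogiri"
gem "loofah"
gem "reverse_markdown"
gem "kramdown"
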
#!/usr/bin/env ruby

require "tumblr_client"
require "time"
require "yaml"
require "fileutils"
require "open-uri"
require "nokogiri"
require "loofah"
require "reverse_markdown"
require "kramdown"
puts "ReverseMarkdown config" | |
ReverseMarkdown.config do |config| | |
config.unknown_tags = :pass_through | |
config.github_flavored = true | |
end | |
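
# OAuth credentials for the Tumblr API (redacted)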
puts "Tumblr config" | |
Tumblr.configure do |config| | |
config.consumer_key = "..." | |
config.consumer_secret = "..." | |
config.oauth_token = "..." | |
config.oauth_token_secret = "..." | |
end | |
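
# Set up the API client (user_info isn't referenced again below)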
client = Tumblr::Client.new
user_info = client.info

per_page = 20

puts "Fetching posts"
data = client.posts("matiaskorhonen.tumblr.com", offset: 0, limit: per_page)
total_posts = data["total_posts"]
puts "Total posts: #{total_posts}"

pages = (total_posts.to_f / per_page).ceil
posts = data["posts"]

# Fetch the remaining pages (the first request already returned page one)
(pages - 1).times do |page|
  data = client.posts("matiaskorhonen.tumblr.com", offset: (page + 1) * per_page, limit: per_page)
  posts.push(*data["posts"])
end
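
# Turn each post into a Middleman-style Markdown file with YAML frontmatter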
puts "Processing posts" | |
posts_count = posts.size | |
posts.each_with_index do |post, index| | |
puts "=> #{index + 1}/#{posts_count}: #{post["slug"]}" | |
date = Time.parse(post["date"]) | |
basename = "#{date.strftime("%Y-%m-%d")}_#{post["slug"]}" | |
filename = "#{basename}.html.md" | |
body = "" | |
type = post["type"] | |
post_path = post["post_url"].gsub("http://matiaskorhonen.tumblr.com/", "") + "/" | |
short_path = post_path.gsub(post["slug"] + "/", "") | |
aliases = [post_path, short_path] | |
if type == "photo" | |
aliases << short_path.gsub("post", "image") | |
end | |
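
  # Frontmatter fields shared by every post type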
  frontmatter = {
    "tumblr_id" => post["id"],
    "alias" => aliases,
    "date" => date,
    "tags" => post["tags"],
    "type" => type
  }
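
  # Type-specific frontmatter and the raw HTML body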
  case type
  when "text"
    frontmatter["title"] = post["title"]
    body = post["body"]
  when "photo"
    frontmatter["source_url"] = post["source_url"] if post["source_url"]
    frontmatter["source_title"] = post["source_title"] if post["source_title"]

    begin
      puts " -> Fetching image"
      photo_url = post["photos"].first["original_size"]["url"]
      puts "    #{photo_url}"

      extension = File.extname(photo_url.split("/").last)
      photo_name = File.basename(photo_url.split("/").last).split(".").first
      photo_file = "#{photo_name}#{extension}"
      photo_path = "#{basename}/#{photo_file}"

      # unless File.exist? photo_path
      #   photo = URI.open(photo_url)
      #
      #   FileUtils.mkdir_p(basename)
      #   File.open photo_path, "wb+" do |f|
      #     f.write photo.read
      #   end
      # end

      frontmatter["photo"] = photo_file
      body = "#{post["caption"]}\n"
    rescue
      puts post["photos"].inspect
      raise
    end
  when "quote"
    body = post["text"]
    frontmatter["source"] = post["source"]
  when "video"
    body = post["caption"]
    frontmatter["video_url"] = post["permalink_url"]
  when "link"
    frontmatter["title"] = post["title"]
    frontmatter["link_url"] = post["url"]
    body = post["description"]
  else
    raise "Unknown type: #{type}"
  end
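
  # Scrubber that downloads every inline <img> and rewrites its src to a local path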
  image_scrubber = Loofah::Scrubber.new do |node|
    if node.name == "img"
      puts " -> Fetching image"
      photo_url = node["src"]
      puts "    #{photo_url}"

      # URI.open rather than bare open: Kernel#open no longer accepts URLs on Ruby 3+
      photo = URI.open(photo_url)
      extension = File.extname(photo_url.split("/").last)
      photo_name = File.basename(photo_url.split("/").last).split(".").first
      photo_path = "#{basename}/#{photo_name}#{extension}"

      FileUtils.mkdir_p(basename)
      File.open photo_path, "wb+" do |f|
        f.write photo.read
      end

      node["src"] = "articles/#{photo_path}"
    end
  end
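
  # Run the scrubber and convert the cleaned-up HTML to Markdown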
  doc = Loofah.fragment(body).scrub!(image_scrubber)
  body = ReverseMarkdown.convert doc.to_s
  body.gsub!('\_', "_") # un-escape underscores that ReverseMarkdown escaped

  # Derive a title from the first line of the body when the post has none
  if frontmatter["title"].nil?
    title_md = body.split("\n").first
    title_html = Kramdown::Document.new(title_md).to_html
    frontmatter["title"] = Nokogiri::HTML::DocumentFragment.parse(title_html).text
    frontmatter["title"].gsub!("\n", " ")
    frontmatter["title"].strip!
    puts " -> Generated title: “#{frontmatter["title"]}”"
  end
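
  # YAML.dump emits the leading "---", so only the closing delimiter is written by hand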
  File.open filename, "w+" do |f|
    f.write YAML.dump(frontmatter)
    f.write "---\n\n"
    f.write body
    f.write "\n"
  end
end

puts "Done"