Last active
November 5, 2019 14:15
-
-
Save robfletcher/5252ae747653bdef99bf to your computer and use it in GitHub Desktop.
Scrape markdown posts from a Tumblr blog and export for Middleman
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * This script will scrape text posts from a Tumblr blog and export them as `.md`
 * files with YAML frontmatter ready to be used in a Middleman blog.
 *
 * Aliases are added for the _middleman-alias_ gem so Tumblr style URLs like
 * `/post/{id}/{slug}` will redirect to the Middleman-style URL.
 *
 * Although I built this for migrating to Middleman it should be pretty easy to
 * adapt this for any similar static site generator.
 */
import groovy.json.* | |
// customize these values | |
final targetDir = // directory where you want the downloaded files | |
final blog = // your tumblr blog, e.g. foo.tumblr.com | |
final apiKey = // your tumblr API key | |
// Number of posts requested per API call; also the step by which `offset` advances.
// NOTE(review): 20 is believed to be the Tumblr API's maximum `limit` — confirm before raising it.
final pageSize = 20
// HTTPS keeps the API key out of cleartext. `limit` is sent explicitly so the page
// length always matches the offset step instead of relying on the server default.
final postsUrl = "https://api.tumblr.com/v2/blog/${blog}/posts/text?api_key=${apiKey}&filter=raw&limit=${pageSize}"

def offset = 0
// The real post count is unknown until the first response arrives, so start with
// a sentinel that guarantees at least one request.
def totalPosts = Integer.MAX_VALUE

// Strict `<`: with `<=` an extra, empty page was requested whenever the post count
// was an exact multiple of the page size.
while (offset < totalPosts) {
    println "fetching posts $offset..${offset + pageSize} of $totalPosts..."
    def url = new URL(postsUrl + "&offset=$offset")
    def json = new JsonSlurper().parse(url)

    json.response.posts.each { post ->
        // The API reports the timestamp in seconds; Date expects milliseconds.
        def timestamp = new Date(post.timestamp * 1000L)
        // Fall back to the post id so posts with an empty slug get distinct file names.
        def slug = post.slug ?: post.id
        def filename = "${timestamp.format('yyyy-MM-dd')}-${slug}.md"
        // Untitled posts have a null title; single quotes are doubled to escape them
        // inside the single-quoted YAML scalar.
        def title = (post.title ?: '').replace("'", "''")
        // Reduce the post URL to its site-relative path for the alias, whichever
        // scheme the API reports it with.
        def alias = post.post_url.replace("https://$blog/", "").replace("http://$blog/", "")

        println "writing $filename..."
        // Write UTF-8 explicitly rather than depending on the platform default charset.
        new File(targetDir, filename).withWriter('UTF-8') { writer ->
            writer << """\
---
title: '${title}'
date: ${timestamp.format("yyyy-MM-dd'T'HH:mm:ssZ")}
tags: ${post.tags.join(", ")}
alias: ["${alias}/"]
---
${post.body}
"""
        }
    }

    totalPosts = json.response.total_posts
    offset += pageSize
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
How does this fit into your workflow?