Skip to content

Instantly share code, notes, and snippets.

@jeremyjbowers
Created January 17, 2013 20:22
Show Gist options
  • Save jeremyjbowers/4559388 to your computer and use it in GitHub Desktop.
Save jeremyjbowers/4559388 to your computer and use it in GitHub Desktop.
This outputs tumblr posts to JSON and CSV. Requires beautifulsoup4, csvkit and requests.
#!/usr/bin/env python
"""
Code which generates tumblr post outputs -- in CSV and JSON.
"""
import json
from bs4 import BeautifulSoup
from csvkit.convert import convert
import requests
# Set up the tumblr urls and base params.
base_url = 'http://api.tumblr.com/v2/blog/inauguration2013.tumblr.com/posts/photo'
# This should be your own app key.
key_param = '?api_key=Cxp2JzyA03QxmQixf7Fee0oIYaFtBTTHKzRA0AveHlh094bwDH'
# Number of posts requested per page. The query-string fragment is derived
# from it so the two values can never drift apart.
limit = 20
limit_param = '&limit=%s' % limit
# Running upper bound available to the pagination code further down.
new_limit = limit
# Make an initial request just to learn how many posts exist.
r = requests.get(base_url + key_param)
# Fail loudly on an HTTP error instead of tripping over a malformed body below.
r.raise_for_status()
# Get the total number of posts so we can calculate pages.
total_count = int(json.loads(r.content)['response']['total_posts'])
# Ceiling division: a partially filled last page still counts as a page.
# Integer arithmetic (//) keeps pages_count an int on Python 2 and Python 3;
# plain "/" yields a float on Python 3, which range() rejects.
pages_count = (total_count + limit - 1) // limit
# Set up the list of pages.
pages = range(0, pages_count)
# Storage for the posts: one dict per post, in the order the API returns them.
ALL_POSTS = []
# Walk the blog one page of `limit` posts at a time.
for page in pages:
    # Offset of the first post on this page (0, limit, 2 * limit, ...).
    start_number = page * limit
    page_param = '&offset=%s' % start_number
    page_url = base_url + key_param + limit_param + page_param
    # Request this page and stop on an HTTP error rather than parsing garbage.
    r = requests.get(page_url)
    r.raise_for_status()
    # Get the posts.
    posts = json.loads(r.content)
    for post in posts['response']['posts']:
        # Intermediate storage for each post.
        post_dict = {}
        # Yeah, yeah, LXML is faster but this is easy. Naming the parser
        # explicitly keeps the parse tree independent of what is installed.
        soup = BeautifulSoup(post['caption'], 'html.parser')
        # Text after the word "from" in the third child of the signature
        # paragraph. NOTE(review): this lookup is unguarded, so a caption
        # without that markup aborts the run -- confirm that is intended.
        post_dict['location'] = soup.select('p.signature-name')[0].contents[2].split('from')[1]
        # Some posts have no usable message paragraph; fall back to an empty
        # string for those, but only for the lookup failures we expect
        # (no match, or a first child that is not a string).
        try:
            post_dict['message'] = soup.select('p.message')[0].contents[0].replace('<br/>', '')
        except (IndexError, AttributeError, TypeError):
            post_dict['message'] = ''
        # Only keep tags related to vote type. If several tags match, the
        # last one wins; if none match, the 'voted' key is left out.
        for tag in post['tags']:
            if 'vot' in tag:
                post_dict['voted'] = tag
        # Append this post to the list of posts.
        ALL_POSTS.append(post_dict)
# Serialize every collected post into a single JSON document.
all_posts_json = json.dumps(ALL_POSTS)
# Persist the JSON text to disk.
with open('posts.json', 'w') as json_out:
    json_out.write(all_posts_json)
# Re-open the JSON file and hand the file object to csvkit, which returns CSV text.
with open('posts.json', 'r') as json_in:
    all_posts_csv = convert(json_in, format='json')
# Persist the CSV text next to the JSON file.
with open('posts.csv', 'w') as csv_out:
    csv_out.write(all_posts_csv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment