Created
January 17, 2013 20:22
-
-
Save jeremyjbowers/4559388 to your computer and use it in GitHub Desktop.
This outputs tumblr posts to JSON and CSV. Requires beautifulsoup4, csvkit and requests.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
Fetch photo posts from the inauguration2013 Tumblr blog and write them
out as JSON (posts.json) and CSV (posts.csv).

Requires beautifulsoup4, csvkit and requests.
"""
import json
import math

from bs4 import BeautifulSoup
from csvkit.convert import convert
import requests

# Tumblr API v2 endpoint for the blog's photo posts.
base_url = 'http://api.tumblr.com/v2/blog/inauguration2013.tumblr.com/posts/photo'
# This should be your own app key.
key_param = '?api_key=Cxp2JzyA03QxmQixf7Fee0oIYaFtBTTHKzRA0AveHlh094bwDH'
limit = 20
limit_param = '&limit=%s' % limit

# First request: discover how many posts exist so we can paginate.
r = requests.get(base_url + key_param)
total_count = int(r.json()['response']['total_posts'])

# Ceiling division: one extra page when the last page is partial.
# NOTE: the original `total_count / limit` was integer division only on
# Python 2; on Python 3 it produced a float and broke range(). math.ceil
# over float division works on both.
pages_count = int(math.ceil(total_count / float(limit)))

# Storage for the flattened posts.
ALL_POSTS = []

for page in range(pages_count):
    # Each page starts `limit` posts after the previous one.
    offset = page * limit
    page_url = base_url + key_param + limit_param + ('&offset=%s' % offset)

    # Request this page of posts.
    r = requests.get(page_url)
    posts = r.json()

    for post in posts['response']['posts']:
        # Intermediate storage for each post.
        post_dict = {}

        # Parse the HTML caption. An explicit parser keeps bs4 quiet and
        # deterministic across environments. (lxml is faster but this is easy.)
        soup = BeautifulSoup(post['caption'], 'html.parser')

        # The location is embedded in the signature line, after the word
        # "from". Posts whose caption doesn't match this structure get an
        # empty field instead of crashing the whole export.
        try:
            post_dict['location'] = soup.select('p.signature-name')[0].contents[2].split('from')[1]
        except (IndexError, AttributeError):
            post_dict['location'] = ''

        # Same guarded treatment for the message text; some posts lack it.
        try:
            post_dict['message'] = soup.select('p.message')[0].contents[0].replace('<br/>', '')
        except (IndexError, AttributeError):
            post_dict['message'] = ''

        # Only keep tags related to vote type ('vot' matches e.g. "voted").
        for tag in post['tags']:
            if 'vot' in tag:
                post_dict['voted'] = tag

        # Append this post to the list of posts.
        ALL_POSTS.append(post_dict)

# Write the collected posts to a JSON file...
with open('posts.json', 'w') as f:
    json.dump(ALL_POSTS, f)

# ...then pass the JSON file buffer to csvkit and write a CSV.
with open('posts.json', 'r') as f:
    all_posts_csv = convert(f, format='json')

with open('posts.csv', 'w') as f:
    f.write(all_posts_csv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment