Created
January 17, 2013 20:22
-
-
Save jeremyjbowers/4559388 to your computer and use it in GitHub Desktop.
This outputs tumblr posts to JSON and CSV. Requires beautifulsoup4, csvkit and requests.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
Fetch photo posts from the inauguration2013 Tumblr blog and write them
out as JSON (posts.json) and CSV (posts.csv).

Requires beautifulsoup4, csvkit and requests.
"""
import json
import math

from bs4 import BeautifulSoup
from csvkit.convert import convert
import requests

# Tumblr API v2 endpoint for the blog's photo posts.
base_url = 'http://api.tumblr.com/v2/blog/inauguration2013.tumblr.com/posts/photo'
# This should be your own app key.
key_param = '?api_key=Cxp2JzyA03QxmQixf7Fee0oIYaFtBTTHKzRA0AveHlh094bwDH'
limit = 20
limit_param = '&limit=%s' % limit

# First request: discover how many posts exist so we can paginate.
r = requests.get(base_url + key_param)
total_count = int(r.json()['response']['total_posts'])

# Ceiling division: one extra page when the last page is partial.
# NOTE: the original `total_count / limit` was integer division only on
# Python 2; on Python 3 it produced a float and broke range(). math.ceil
# over float division works on both.
pages_count = int(math.ceil(total_count / float(limit)))

# Storage for the flattened posts.
ALL_POSTS = []

for page in range(pages_count):
    # Each page starts `limit` posts after the previous one.
    offset = page * limit
    page_url = base_url + key_param + limit_param + ('&offset=%s' % offset)

    # Request this page of posts.
    r = requests.get(page_url)
    posts = r.json()

    for post in posts['response']['posts']:
        # Intermediate storage for each post.
        post_dict = {}

        # Parse the HTML caption. An explicit parser keeps bs4 quiet and
        # deterministic across environments. (lxml is faster but this is easy.)
        soup = BeautifulSoup(post['caption'], 'html.parser')

        # The location is embedded in the signature line, after the word
        # "from". Posts whose caption doesn't match this structure get an
        # empty field instead of crashing the whole export.
        try:
            post_dict['location'] = soup.select('p.signature-name')[0].contents[2].split('from')[1]
        except (IndexError, AttributeError):
            post_dict['location'] = ''

        # Same guarded treatment for the message text; some posts lack it.
        try:
            post_dict['message'] = soup.select('p.message')[0].contents[0].replace('<br/>', '')
        except (IndexError, AttributeError):
            post_dict['message'] = ''

        # Only keep tags related to vote type ('vot' matches e.g. "voted").
        for tag in post['tags']:
            if 'vot' in tag:
                post_dict['voted'] = tag

        # Append this post to the list of posts.
        ALL_POSTS.append(post_dict)

# Write the collected posts to a JSON file...
with open('posts.json', 'w') as f:
    json.dump(ALL_POSTS, f)

# ...then pass the JSON file buffer to csvkit and write a CSV.
with open('posts.json', 'r') as f:
    all_posts_csv = convert(f, format='json')

with open('posts.csv', 'w') as f:
    f.write(all_posts_csv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment