Created
March 29, 2013 22:19
-
-
Save akesling/5274075 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2013 [email protected] | |
import requests | |
import urllib | |
import datetime | |
import time | |
import os | |
import json | |
# Unix-timestamp identifier for this scrape run; used to name output directories
# so repeated runs never collide.
RUN_ID = int(time.mktime(datetime.datetime.now().timetuple()))
# Upper bound on posts requested per API call (Reader API page-size cap).
MAX_POSTS_REQUESTED = 1000
# HTTP status code indicating a successful response.
HTTP_SUCCESS = 200
# Example feed URL to pass to sow().
CNN_TECH = 'http://rss.cnn.com/rss/cnn_tech.rss'
def sow(feed, delay=0, posts=1000, directory='chunks'): | |
path = os.path.join(directory, '%s.%s' % (unicode(RUN_ID), 'items')) | |
if not os.path.exists(path): | |
os.makedirs(path) | |
post_inc = min(MAX_POSTS_REQUESTED, posts) | |
num_items = 0 | |
cur = retrieve(feed, posts=post_inc) | |
while cur.status_code == HTTP_SUCCESS and num_items < posts: | |
with open(os.path.join(path, '%s.json'%num_items), 'w+') as f: | |
f.write(cur.text.encode('utf8')) | |
obj = cur.json() | |
cur = retrieve(feed, posts=post_inc, continuation=obj['continuation']) | |
num_items += len(obj['items']) | |
time.sleep(delay) | |
if cur.status_code != HTTP_SUCCESS: | |
print 'RAGE QUIT AT STATUS CODE %s ON PAGE %s!!!!1!' % (cur.status_code, cur.url) | |
else: | |
print 'Gracefully exiting after guzzling %s posts.' % num_items | |
def retrieve(feed, posts=1000, continuation=None):
    """Fetch one page of a feed's contents from the Google Reader API.

    Returns the requests.Response object for the stream/contents
    endpoint; pass the previous response's 'continuation' token to get
    the next page.
    """
    endpoint = 'http://www.google.com/reader/api/0/stream/contents/feed/%s'
    # 'ck' is a cache-busting client timestamp; 'n' is the page size.
    params = dict(
        allcomments='false',
        output='json',
        ck=int(time.mktime(datetime.datetime.now().timetuple())),
        ot=0,
        n=posts,
        client='scroll',
    )
    if continuation:
        params['c'] = continuation
    return requests.get(endpoint % urllib.quote_plus(feed), params=params)
def scrape(items, directory='chunk', delay=0): | |
path = os.path.join(directory, '%s.%s' % (unicode(RUN_ID), 'pages')) | |
if not os.path.exists(path): | |
os.makedirs(path) | |
for i in items: | |
try: | |
href = i['canonical'][0]['href'] | |
page = requests.get(href) | |
if page.status_code == HTTP_SUCCESS: | |
with open(os.path.join(path, urllib.quote_plus(href)), 'w+') as f: | |
f.write(page.text.encode('utf8')) | |
print 'Great success for %s!' % (page.url) | |
else: | |
print 'RAGE QUIT AT STATUS CODE %s ON PAGE %s!!!!1!' % (page.status_code, page.url) | |
except requests.ConnectionError: | |
pass | |
def reap(directory):
    """Re-read every saved feed chunk in *directory* and scrape the
    pages its items link to.

    Subdirectories (e.g. the <RUN_ID>.pages output dir) are skipped.
    """
    for entry in os.listdir(directory):
        full_path = os.path.join(directory, entry)
        if not os.path.isfile(full_path):
            continue
        with open(full_path, 'r') as feedfile:
            parsed = json.load(feedfile)
            scrape(parsed['items'], directory)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment