Skip to content

Instantly share code, notes, and snippets.

@nikolak
Last active January 3, 2016 03:39
Show Gist options
  • Select an option

  • Save nikolak/8403499 to your computer and use it in GitHub Desktop.

Select an option

Save nikolak/8403499 to your computer and use it in GitHub Desktop.
Fetches every available item from the given reddit listing pages by following the &after= pagination parameter.
#!/usr/bin/env python
import sys
import json
import time
import urllib
import httplib2
from urllib import urlencode
from urlparse import urlparse, urlunparse, parse_qs
from optparse import OptionParser
from xml.sax.saxutils import escape as escape_html
from xml.sax.saxutils import unescape as unescape_html
# Module-level configuration for how aggressively we poll reddit.
# please don't hurt reddit
fetch_size = 100 # items per request; the higher the better, but reddit ignores +100
sleep_time = 1 # in seconds. how long to sleep between
# requests. higher is better
request_limit = None # how many requests to make to reddit before
# stopping (set to None to disable)
debug = False  # when True, get_links() logs each fetched URL to stderr
# Shared HTTP client used by login() and get_links() for cookie-based requests.
http = httplib2.Http()
def login(username, password):
url = 'http://www.reddit.com/api/login/%s' % username
body = {'user': username,
'passwd': password}
headers = {'Content-type': 'application/x-www-form-urlencoded'}
try:
response, content = http.request(url, 'POST', headers=headers, body=urlencode(body))
except Exception, e:
print "Could not login"
print e
sys.exit(1)
return response['set-cookie']
def get_links(sourceurl, login_cookie = '', requests = 0):
    '''
    Given a reddit JSON URL, yield the JSON Link API objects,
    following 'after' links

    sourceurl    -- a reddit listing URL (e.g. .../user/name/.json?sort=top)
    login_cookie -- session cookie from login(); when set, requests go
                    through the shared httplib2 client with that cookie
    requests     -- recursion depth counter, compared against the module
                    global request_limit to cap total requests
    '''
    # rip apart the URL, make sure it has .json at the end, and set
    # the limit
    scheme, host, path, params, query, fragment = urlparse(sourceurl)
    parsed_params = parse_qs(query) if query else {}
    # force the page size; urlencode(doseq=True) expects list values
    parsed_params['limit'] = [fetch_size]
    fragment = None # erase the fragment, we don't use it
    # assert path.endswith('.json') or path.endswith('/')
    # if path.endswith('/'):
    # path = path + '.json'
    new_urltuple = (scheme, host, path, params,
                    urlencode(parsed_params, doseq = True), fragment)
    composed_sourceurl = urlunparse(new_urltuple)
    if debug:
        sys.stderr.write('fetching %s\n' % composed_sourceurl)
    # authenticated requests go through httplib2 so the cookie is sent;
    # anonymous ones use plain urllib
    if login_cookie:
        headers = {'Cookie': login_cookie}
        response, text = http.request(composed_sourceurl, 'GET', headers=headers)
    else:
        text = urllib.urlopen(composed_sourceurl).read()
    parsed = json.loads(text)
    # there may be multiple listings, like on a comments-page, but we
    # can only export from pages with one listing
    assert parsed['kind'] == 'Listing'
    listing = parsed['data']
    for child in listing.get('children', []):
        yield child
    # follow the 'after' cursor unless we've hit request_limit
    if (listing.get('after', None)
        and (request_limit is None
             or requests < request_limit - 1)):
        after_parsed_params = parsed_params.copy()
        after_parsed_params['after'] = [listing['after']]
        after_urltuple = (scheme, host, path, params,
                          urlencode(after_parsed_params, doseq = True),
                          fragment)
        after_sourceurl = urlunparse(after_urltuple)
        # throttle between pages to be polite to reddit
        time.sleep(sleep_time)
        # yes, this is recursive, but if you're making enough requests
        # to blow out your stack, you're probably hurting reddit
        for link in get_links(after_sourceurl, login_cookie, requests+1):
            yield link
def main(sourceurl, username = None, password = None):
    '''
    Given a reddit JSON url, yield unicode strings that represent the
    exported HTML
    '''
    # Only authenticate when both credentials were supplied.
    cookie = login(username, password) if (username and password) else None
    for item in get_links(sourceurl, cookie):
        kind = item['kind']
        # t3 = link posts, t1 = comments; both export the same way here,
        # as an HTML-escaped id followed by a comma.
        if kind in ('t3', 't1'):
            yield "{},".format(escape_html(item['data']['id']))
        else:
            raise TypeError("I don't know how to decode %r" % item)
def make_stats(input_file):
with open(input_file,"r") as in_file:
_,top,new,cont=[l.split(",") for l in in_file.read().split("\n")]
for k,v in {"top":top,"new":new,"controversial":cont}.items():
print "{} has {} items".format(k,len(v))
print "Total number of unique items:{}".format(len(list(set(top+new+cont))))
if __name__=='__main__':
    # NOTE(review): hard-coded credentials checked into the gist;
    # 'hunter2' is presumably a placeholder — replace before running.
    username='wub_wub'
    password='hunter2'
    # One JSON listing URL per sort order for the same user.
    sources={"top":'http://www.reddit.com/user/wub_wub/.json?sort=top',
             "new":'http://www.reddit.com/user/wub_wub/.json?sort=new',
             "controversial":'http://www.reddit.com/user/wub_wub/.json?sort=controversial'
             }
    debug = True
    # Append each listing's ids to the "data" file, one newline-separated
    # section per sort order, then summarize the result.
    for k,v in sources.items():
        with open("data","a") as output:
            output.write("\n")
            for i in main(v,username,password):
                output.write(i)
    make_stats("data")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment