Last active
January 3, 2016 03:39
-
-
Save nikolak/8403499 to your computer and use it in GitHub Desktop.
Fetches all available items from the given reddit JSON pages by following the &after= pagination parameter.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import sys | |
| import json | |
| import time | |
| import urllib | |
| import httplib2 | |
| from urllib import urlencode | |
| from urlparse import urlparse, urlunparse, parse_qs | |
| from optparse import OptionParser | |
| from xml.sax.saxutils import escape as escape_html | |
| from xml.sax.saxutils import unescape as unescape_html | |
# Module-level configuration shared by login()/get_links() below.
# please don't hurt reddit
fetch_size = 100        # items requested per page; reddit ignores values above 100
sleep_time = 1          # seconds to sleep between successive requests; higher is kinder
request_limit = None    # maximum number of requests to make (None = no limit)
debug = False           # when True, each fetched URL is written to stderr
http = httplib2.Http()  # one shared HTTP client, reused for every request
def login(username, password):
    '''
    Log in to reddit with the given credentials and return the
    session cookie string to send with subsequent requests.

    Exits the process with status 1 when the request fails or when
    no session cookie comes back (e.g. wrong credentials -- reddit
    can answer 200 without a set-cookie header).
    '''
    url = 'http://www.reddit.com/api/login/%s' % username
    body = {'user': username,
            'passwd': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    try:
        response, content = http.request(url, 'POST', headers=headers,
                                         body=urlencode(body))
    except Exception as e:  # "as" form works on py2.6+ and py3
        print("Could not login")
        print(e)
        sys.exit(1)
    # The original indexed response['set-cookie'] directly, which raised
    # a bare KeyError on a failed login; fail with a clear message instead.
    cookie = response.get('set-cookie')
    if not cookie:
        print("Could not login")
        sys.exit(1)
    return cookie
def get_links(sourceurl, login_cookie='', requests=0):
    '''
    Yield the JSON Link API objects found at a reddit listing URL,
    transparently following the listing's 'after' pagination links.
    '''
    # Decompose the URL so our fetch size can be forced into the query.
    scheme, host, path, params, query, fragment = urlparse(sourceurl)
    query_params = parse_qs(query) if query else {}
    query_params['limit'] = [fetch_size]
    fragment = None  # the fragment is never used

    page_url = urlunparse((scheme, host, path, params,
                           urlencode(query_params, doseq=True), fragment))
    if debug:
        sys.stderr.write('fetching %s\n' % page_url)

    # Authenticated requests go through httplib2 so the cookie is sent;
    # anonymous ones use plain urllib.
    if login_cookie:
        response, text = http.request(page_url, 'GET',
                                      headers={'Cookie': login_cookie})
    else:
        text = urllib.urlopen(page_url).read()

    parsed = json.loads(text)
    # Only single-listing pages are supported (comment pages have several).
    assert parsed['kind'] == 'Listing'
    listing = parsed['data']

    for child in listing.get('children', []):
        yield child

    after_token = listing.get('after', None)
    under_limit = request_limit is None or requests < request_limit - 1
    if after_token and under_limit:
        next_params = dict(query_params, after=[after_token])
        next_url = urlunparse((scheme, host, path, params,
                               urlencode(next_params, doseq=True), fragment))
        time.sleep(sleep_time)
        # Recursion depth equals the number of pages fetched; a depth big
        # enough to blow the stack would mean hammering reddit anyway.
        for link in get_links(next_url, login_cookie, requests + 1):
            yield link
def main(sourceurl, username=None, password=None):
    '''
    Given a reddit JSON url, yield one "<id>," string per link (t3)
    or comment (t1) child, logging in first when credentials are
    supplied.

    Raises TypeError for any other 'kind' of child object.
    '''
    cookie = None
    if username and password:
        cookie = login(username, password)
    for link in get_links(sourceurl, cookie):
        data = link['data']
        # Links (t3) and comments (t1) were exported by two byte-identical
        # branches in the original; they are merged here.
        if link['kind'] in ('t3', 't1'):
            yield "{},".format(escape_html(data['id']))
        else:
            raise TypeError("I don't know how to decode %r" % link)
def make_stats(input_file):
    '''
    Read the 4-line export file (a leading blank line, then one
    comma-separated line of ids per sort order: top, new,
    controversial) and print per-category counts plus the overall
    unique-item count.
    '''
    with open(input_file, "r") as in_file:
        lines = in_file.read().split("\n")
    # Every line ends with a trailing comma, so split(",") produces a
    # trailing "" that inflated each count (and the unique set) by one
    # in the original; filter the empties out.
    _, top, new, cont = [[item for item in l.split(",") if item]
                         for l in lines]
    for label, items in {"top": top, "new": new, "controversial": cont}.items():
        print("{} has {} items".format(label, len(items)))
    print("Total number of unique items:{}".format(len(set(top + new + cont))))
if __name__=='__main__':
    # NOTE(review): credentials are hard-coded in source -- move them to
    # a config file or environment variables before sharing this script.
    username='wub_wub'
    password='hunter2'
    # One listing URL per sort order; each is fully paginated by get_links.
    sources={"top":'http://www.reddit.com/user/wub_wub/.json?sort=top',
             "new":'http://www.reddit.com/user/wub_wub/.json?sort=new',
             "controversial":'http://www.reddit.com/user/wub_wub/.json?sort=controversial'
             }
    debug = True  # log each fetched URL to stderr
    # Append one newline-prefixed run of "id," entries per sort order to
    # the "data" file, then summarise it.
    # NOTE(review): mode "a" assumes "data" does not already exist; a
    # rerun appends more lines and breaks make_stats' 4-line parse.
    for k,v in sources.items():
        with open("data","a") as output:
            output.write("\n")
            for i in main(v,username,password):
                output.write(i)
    make_stats("data")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment