Last active
January 3, 2016 03:39
-
-
Save nikolak/8403499 to your computer and use it in GitHub Desktop.
Fetches all available items from the given reddit JSON pages by following the &after= pagination parameter.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import sys | |
| import json | |
| import time | |
| import urllib | |
| import httplib2 | |
| from urllib import urlencode | |
| from urlparse import urlparse, urlunparse, parse_qs | |
| from optparse import OptionParser | |
| from xml.sax.saxutils import escape as escape_html | |
| from xml.sax.saxutils import unescape as unescape_html | |
# Module-level configuration shared by login()/get_links() below.
# please don't hurt reddit
fetch_size = 100        # items requested per page; reddit ignores values above 100
sleep_time = 1          # seconds to sleep between successive requests; higher is kinder
request_limit = None    # maximum number of requests to make (None = no limit)
debug = False           # when True, each fetched URL is written to stderr
http = httplib2.Http()  # one shared HTTP client, reused for every request
def login(username, password):
    '''
    Log in to reddit with the given credentials and return the
    session cookie string to send with subsequent requests.

    Exits the process with status 1 when the request fails or when
    no session cookie comes back (e.g. wrong credentials -- reddit
    can answer 200 without a set-cookie header).
    '''
    url = 'http://www.reddit.com/api/login/%s' % username
    body = {'user': username,
            'passwd': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    try:
        response, content = http.request(url, 'POST', headers=headers,
                                         body=urlencode(body))
    except Exception as e:  # "as" form works on py2.6+ and py3
        print("Could not login")
        print(e)
        sys.exit(1)
    # The original indexed response['set-cookie'] directly, which raised
    # a bare KeyError on a failed login; fail with a clear message instead.
    cookie = response.get('set-cookie')
    if not cookie:
        print("Could not login")
        sys.exit(1)
    return cookie
def get_links(sourceurl, login_cookie='', requests=0):
    '''
    Yield the JSON Link API objects found at a reddit listing URL,
    transparently following the listing's 'after' pagination links.
    '''
    # Decompose the URL so our fetch size can be forced into the query.
    scheme, host, path, params, query, fragment = urlparse(sourceurl)
    query_params = parse_qs(query) if query else {}
    query_params['limit'] = [fetch_size]
    fragment = None  # the fragment is never used

    page_url = urlunparse((scheme, host, path, params,
                           urlencode(query_params, doseq=True), fragment))
    if debug:
        sys.stderr.write('fetching %s\n' % page_url)

    # Authenticated requests go through httplib2 so the cookie is sent;
    # anonymous ones use plain urllib.
    if login_cookie:
        response, text = http.request(page_url, 'GET',
                                      headers={'Cookie': login_cookie})
    else:
        text = urllib.urlopen(page_url).read()

    parsed = json.loads(text)
    # Only single-listing pages are supported (comment pages have several).
    assert parsed['kind'] == 'Listing'
    listing = parsed['data']

    for child in listing.get('children', []):
        yield child

    after_token = listing.get('after', None)
    under_limit = request_limit is None or requests < request_limit - 1
    if after_token and under_limit:
        next_params = dict(query_params, after=[after_token])
        next_url = urlunparse((scheme, host, path, params,
                               urlencode(next_params, doseq=True), fragment))
        time.sleep(sleep_time)
        # Recursion depth equals the number of pages fetched; a depth big
        # enough to blow the stack would mean hammering reddit anyway.
        for link in get_links(next_url, login_cookie, requests + 1):
            yield link
def main(sourceurl, username=None, password=None):
    '''
    Given a reddit JSON url, yield one "<id>," string per link (t3)
    or comment (t1) child, logging in first when credentials are
    supplied.

    Raises TypeError for any other 'kind' of child object.
    '''
    cookie = None
    if username and password:
        cookie = login(username, password)
    for link in get_links(sourceurl, cookie):
        data = link['data']
        # Links (t3) and comments (t1) were exported by two byte-identical
        # branches in the original; they are merged here.
        if link['kind'] in ('t3', 't1'):
            yield "{},".format(escape_html(data['id']))
        else:
            raise TypeError("I don't know how to decode %r" % link)
def make_stats(input_file):
    '''
    Read the 4-line export file (a leading blank line, then one
    comma-separated line of ids per sort order: top, new,
    controversial) and print per-category counts plus the overall
    unique-item count.
    '''
    with open(input_file, "r") as in_file:
        lines = in_file.read().split("\n")
    # Every line ends with a trailing comma, so split(",") produces a
    # trailing "" that inflated each count (and the unique set) by one
    # in the original; filter the empties out.
    _, top, new, cont = [[item for item in l.split(",") if item]
                         for l in lines]
    for label, items in {"top": top, "new": new, "controversial": cont}.items():
        print("{} has {} items".format(label, len(items)))
    print("Total number of unique items:{}".format(len(set(top + new + cont))))
if __name__=='__main__':
    # NOTE(review): credentials are hard-coded in source -- move them to
    # a config file or environment variables before sharing this script.
    username='wub_wub'
    password='hunter2'
    # One listing URL per sort order; each is fully paginated by get_links.
    sources={"top":'http://www.reddit.com/user/wub_wub/.json?sort=top',
             "new":'http://www.reddit.com/user/wub_wub/.json?sort=new',
             "controversial":'http://www.reddit.com/user/wub_wub/.json?sort=controversial'
             }
    debug = True  # log each fetched URL to stderr
    # Append one newline-prefixed run of "id," entries per sort order to
    # the "data" file, then summarise it.
    # NOTE(review): mode "a" assumes "data" does not already exist; a
    # rerun appends more lines and breaks make_stats' 4-line parse.
    for k,v in sources.items():
        with open("data","a") as output:
            output.write("\n")
            for i in main(v,username,password):
                output.write(i)
    make_stats("data")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment