Created
February 3, 2014 19:27
-
-
Save Plutor/8790603 to your computer and use it in GitHub Desktop.
Get reddit Super Bowl threads comment list
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import re | |
import urllib | |
import urllib2 | |
from time import sleep | |
# Global scrape state: running comment tally, queue of "more children" comment
# ids still to fetch, and the output CSV handle (note: the name shadows the
# stdlib `csv` module, which this script never imports).
comments = 0
missing = []
csv = open('super-bowl-comments.csv', 'w')

# Super Bowl game-thread ids on /r/nfl.
# bugfix: the original list contained '1wuf5a' and '1wudme' twice, so those
# threads were fetched -- and their comments counted and written -- twice.
# Duplicates removed, first-occurrence order preserved.
thread_ids = ['1wuf5a',
              '1wudme', '1wtyp4', '1wuk91', '1wuo3p',
              '1wuq5y', '1wuw3l', '1wuyds',
              ]

# depth=0 asks reddit for the flattest listing; deeper comments come back as
# "more" stubs that get queued in `missing`.
url_pattern = 'http://www.reddit.com/comments/%s.json?depth=0'
n = 0  # request counter, used to number the saved dataNNNN.json files
def get_url_comments(url, data=None):
    """Fetch a reddit JSON listing and stream every decoded object through hook().

    Side effects: increments the global request counter `n`, saves the raw
    response body to dataNNNN.json, and -- via hook() -- writes comment rows
    to the global CSV handle and extends the global `missing` queue.

    url  -- fully-qualified reddit URL returning JSON
    data -- optional dict of POST parameters (form-encoded before sending);
            when None the request is a plain GET
    """
    global n
    n += 1
    try:
        if data is not None:
            # form-encode the POST parameters and force them to bytes
            data = urllib.urlencode(data)
            data = data.encode('ascii')
        print("Getting %s" % url)
        request = urllib2.Request(url, data)
        request.add_header('User-Agent',
                           '/r/nfl thread snapshotter by /u/plutor')
        request.add_header('if-modified-since',
                           'Fri, 29 May 2007 08:34:38 GMT')
        response = urllib2.urlopen(request)
        try:
            content = response.read()
        finally:
            # bugfix: the original closed the response only after a successful
            # json.loads(), leaking the connection if read/parse raised
            response.close()
        print(" Got %d bytes" % len(content))
        if len(content) < 100:
            # suspiciously small responses are usually API error bodies
            print(content)
        # save the raw payload for offline reprocessing; `with` guarantees the
        # handle is closed even if the write raises
        with open('data%04d.json' % n, 'w') as jf:
            jf.write(str(content))
        # parse json -- hook() fires once per decoded object and does the
        # actual CSV writing / queueing
        data = json.loads(content.decode("utf8"), object_hook=hook)
        print(" Got %d comments" % comments)
        sleep(2)  # be polite to reddit's rate limiter
    except urllib2.URLError as e:
        print("Got error %s" % e)
def hook(json_data):
    """json.loads object_hook: tally comments and queue collapsed 'more' stubs.

    For every comment object (kind "t1") this bumps the global comment count
    and writes one CSV row: author, created timestamp, flair text, and the
    body length in bytes.  For every "more" stub it appends the child ids to
    the global `missing` queue for a later morechildren fetch.  Returns the
    dict unchanged so json.loads can keep assembling the tree.
    """
    global missing, csv, comments
    kind = json_data.get("kind")
    if kind == "t1":
        comments += 1
        payload = json_data["data"]
        # body length measured in bytes after squashing non-ascii to '?'
        body_len = len(payload["body"].encode('ascii', 'replace'))
        row = '"%s","%s","%s",%d\n' % (payload["author"],
                                       payload["created"],
                                       payload["author_flair_text"],
                                       body_len)
        csv.write(row)
    if kind == "more" and "children" in json_data.get("data", {}):
        children = json_data["data"]["children"]
        print(" Adding %d more to queue" % len(children))
        missing.extend(children)
    return json_data
#############
# Main driver: fetch each game thread, then drain its "more children" queue
# in batches of 20 (the reddit morechildren API batch size).
for thread_id in thread_ids:
    print('Now retrieving thread %s' % thread_id)
    url = url_pattern % thread_id
    get_url_comments(url)
    while len(missing) > 0:
        toget = missing[:20]
        # bugfix: slice from 20, not 21 -- the original `missing[21:]`
        # silently dropped the 21st queued comment id on every batch
        missing = missing[20:]
        print("Missing queue: %d -- getting %d" % (len(missing), len(toget)))
        moreurl = "http://www.reddit.com/api/morechildren.json"
        params = {'children': ','.join(toget),
                  'link_id': "t3_%s" % thread_id,
                  'r': "nfl"}
        get_url_comments(moreurl, params)
csv.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment