@Plutor
Created February 3, 2014 19:27
Get the comment list from reddit Super Bowl threads
import json
import re
import urllib
import urllib2
from time import sleep
comments = 0
missing = []
csv = open('super-bowl-comments.csv', 'w')
thread_ids = ['1wuf5a',
              '1wudme', '1wtyp4', '1wudme', '1wuk91', '1wuo3p',
              '1wuq5y', '1wuw3l', '1wuyds', '1wuf5a'
              ]
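# note: '1wuf5a' and '1wudme' each appear twice above, so those two threads
# are fetched twice and their comments double-counted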
url_pattern = 'http://www.reddit.com/comments/%s.json?depth=0'
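# appending .json to a reddit comments URL returns the thread as JSON;
# depth=0 asks for a shallow tree, so deeper replies arrive as "more" stubs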
n = 0
def get_url_comments(url, data=None):
    global n
    n += 1
    try:
        if data is not None:
            data = urllib.urlencode(data)
            data = data.encode('ascii')
        print("Getting %s" % url)
        request = urllib2.Request(url, data)
        request.add_header('User-Agent',
                           '/r/nfl thread snapshotter by /u/plutor')
        request.add_header('if-modified-since',
                           'Fri, 29 May 2007 08:34:38 GMT')
        response = urllib2.urlopen(request)
        content = response.read()
        print(" Got %d bytes" % len(content))
        if len(content) < 100:
            # a tiny response is almost certainly an error page; show it
            print(content)
        # save the raw response for later inspection
        jf = open('data%04d.json' % n, 'w')
        jf.write(str(content))
        jf.close()
        # parse json; hook() runs on every object as it is decoded
        data = json.loads(content.decode("utf8"), object_hook=hook)
        response.close()
        print(" Got %d comments" % comments)
        sleep(2)  # be polite to reddit's rate limiter
    except urllib2.URLError as e:
        print("Got error %s" % e)
def hook(json_data):
    global missing, csv, comments
    if "kind" in json_data:
        if json_data["kind"] == "t1":
            # "t1" objects are comments: count each one and write a CSV row
            comments += 1
            body = json_data["data"]["body"]
            body_len = len(body.encode('ascii', 'replace'))
            csv.write('"%s","%s","%s",%d\n' % (json_data["data"]["author"],
                                               json_data["data"]["created"],
                                               json_data["data"]["author_flair_text"],
                                               body_len))
        if (json_data["kind"] == "more" and "data" in json_data
                and "children" in json_data["data"]):
            # "more" objects list the IDs of comments omitted from this response
            print(" Adding %d more to queue" % len(json_data["data"]["children"]))
            for child in json_data["data"]["children"]:
                missing.append(child)
    return json_data
#############
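# Reddit truncates large threads: each "more" stub queued above holds IDs of
# omitted comments, which /api/morechildren returns in batches. The batch
# size of 20 below is this script's own choice; reddit's per-request cap has
# varied over time, so treat it as an assumption.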
for thread_id in thread_ids:
    print("Now retrieving thread %s" % thread_id)
    url = url_pattern % thread_id
    get_url_comments(url)
    while len(missing) > 0:
        toget = missing[:20]
        missing = missing[20:]  # was missing[21:], which silently dropped one ID per batch
        print("Missing queue: %d -- getting %d" % (len(missing), len(toget)))
        moreurl = "http://www.reddit.com/api/morechildren.json"
        params = {'children': ','.join(toget),
                  'link_id': "t3_%s" % thread_id,
                  'r': "nfl"}
        get_url_comments(moreurl, params)
csv.close()
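The script above is Python 2 (urllib2 and urllib.urlencode are gone in
Python 3). If you want to reproduce the fetch today, a minimal Python 3
sketch of the same request might look like the following; fetch_thread_json
is a hypothetical name, and the HTTPS endpoint reflects reddit's current
requirements rather than anything in the original gist:

import json
import urllib.request

def fetch_thread_json(thread_id):
    # same idea as url_pattern above: the .json view of a thread, shallow tree
    url = 'https://www.reddit.com/comments/%s.json?depth=0' % thread_id
    request = urllib.request.Request(
        url,
        headers={'User-Agent': '/r/nfl thread snapshotter by /u/plutor'})
    with urllib.request.urlopen(request) as response:
        return json.loads(response.read().decode('utf8'))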