Created
February 3, 2014 19:27
-
-
Save Plutor/8790603 to your computer and use it in GitHub Desktop.
Get reddit Super Bowl threads comment list
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import re | |
import urllib | |
import urllib2 | |
from time import sleep | |
# Global scrape state: running comment tally, queue of "more children" comment
# ids still to fetch, and the output CSV handle (note: the name shadows the
# stdlib `csv` module, which this script never imports).
comments = 0
missing = []
csv = open('super-bowl-comments.csv', 'w')

# Super Bowl game-thread ids on /r/nfl.
# bugfix: the original list contained '1wuf5a' and '1wudme' twice, so those
# threads were fetched -- and their comments counted and written -- twice.
# Duplicates removed, first-occurrence order preserved.
thread_ids = ['1wuf5a',
              '1wudme', '1wtyp4', '1wuk91', '1wuo3p',
              '1wuq5y', '1wuw3l', '1wuyds',
              ]

# depth=0 asks reddit for the flattest listing; deeper comments come back as
# "more" stubs that get queued in `missing`.
url_pattern = 'http://www.reddit.com/comments/%s.json?depth=0'
n = 0  # request counter, used to number the saved dataNNNN.json files
def get_url_comments(url, data=None):
    """Fetch a reddit JSON listing and stream every decoded object through hook().

    Side effects: increments the global request counter `n`, saves the raw
    response body to dataNNNN.json, and -- via hook() -- writes comment rows
    to the global CSV handle and extends the global `missing` queue.

    url  -- fully-qualified reddit URL returning JSON
    data -- optional dict of POST parameters (form-encoded before sending);
            when None the request is a plain GET
    """
    global n
    n += 1
    try:
        if data is not None:
            # form-encode the POST parameters and force them to bytes
            data = urllib.urlencode(data)
            data = data.encode('ascii')
        print("Getting %s" % url)
        request = urllib2.Request(url, data)
        request.add_header('User-Agent',
                           '/r/nfl thread snapshotter by /u/plutor')
        request.add_header('if-modified-since',
                           'Fri, 29 May 2007 08:34:38 GMT')
        response = urllib2.urlopen(request)
        try:
            content = response.read()
        finally:
            # bugfix: the original closed the response only after a successful
            # json.loads(), leaking the connection if read/parse raised
            response.close()
        print(" Got %d bytes" % len(content))
        if len(content) < 100:
            # suspiciously small responses are usually API error bodies
            print(content)
        # save the raw payload for offline reprocessing; `with` guarantees the
        # handle is closed even if the write raises
        with open('data%04d.json' % n, 'w') as jf:
            jf.write(str(content))
        # parse json -- hook() fires once per decoded object and does the
        # actual CSV writing / queueing
        data = json.loads(content.decode("utf8"), object_hook=hook)
        print(" Got %d comments" % comments)
        sleep(2)  # be polite to reddit's rate limiter
    except urllib2.URLError as e:
        print("Got error %s" % e)
def hook(json_data):
    """json.loads object_hook: tally comments and queue collapsed 'more' stubs.

    For every comment object (kind "t1") this bumps the global comment count
    and writes one CSV row: author, created timestamp, flair text, and the
    body length in bytes.  For every "more" stub it appends the child ids to
    the global `missing` queue for a later morechildren fetch.  Returns the
    dict unchanged so json.loads can keep assembling the tree.
    """
    global missing, csv, comments
    kind = json_data.get("kind")
    if kind == "t1":
        comments += 1
        payload = json_data["data"]
        # body length measured in bytes after squashing non-ascii to '?'
        body_len = len(payload["body"].encode('ascii', 'replace'))
        row = '"%s","%s","%s",%d\n' % (payload["author"],
                                       payload["created"],
                                       payload["author_flair_text"],
                                       body_len)
        csv.write(row)
    if kind == "more" and "children" in json_data.get("data", {}):
        children = json_data["data"]["children"]
        print(" Adding %d more to queue" % len(children))
        missing.extend(children)
    return json_data
#############
# Main driver: fetch each game thread, then drain its "more children" queue
# in batches of 20 (the reddit morechildren API batch size).
for thread_id in thread_ids:
    print('Now retrieving thread %s' % thread_id)
    url = url_pattern % thread_id
    get_url_comments(url)
    while len(missing) > 0:
        toget = missing[:20]
        # bugfix: slice from 20, not 21 -- the original `missing[21:]`
        # silently dropped the 21st queued comment id on every batch
        missing = missing[20:]
        print("Missing queue: %d -- getting %d" % (len(missing), len(toget)))
        moreurl = "http://www.reddit.com/api/morechildren.json"
        params = {'children': ','.join(toget),
                  'link_id': "t3_%s" % thread_id,
                  'r': "nfl"}
        get_url_comments(moreurl, params)
csv.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment