Last active
July 27, 2019 06:46
-
-
Save mdamien/36d05bc75423bc2c88ac1eb24b8a11f2 to your computer and use it in GitHub Desktop.
parse_reddit.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests, csv | |
# optional: requests-cache makes it easy to cache your results
# import requests_cache | |
# requests_cache.install_cache('reddit_cache') | |
# Module-level accumulator for every recorded comment: parse_comment appends
# one [author, reply_to, body] row per comment, and the rows are written to
# reddit.csv once crawling is finished.
DATA = []
def get(url, args=None):
    """Fetch ``url`` with a GET request and return the decoded JSON body.

    Args:
        url: Absolute URL to request.
        args: Optional mapping of query-string parameters. ``None`` (the
            default) sends no parameters.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        Exception: Any error raised while sending the request or decoding
            the JSON is re-raised unchanged, after the raw response text
            (when a response was received) has been printed.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    params = {} if args is None else args
    print('GET', url, params)
    resp = None
    try:
        resp = requests.get(url, params, headers={'User-agent': 'linkage.fr'})
        return resp.json()
    except Exception:
        # ``resp`` is still None when the request itself failed; only dump
        # the body when there is one, so the original error is not masked
        # by a NameError/UnboundLocalError.
        if resp is not None:
            print(resp.text)
        raise
def parse_comment(comment, reply_to=None, rows=None):
    """Record a comment and, recursively, every reply nested under it.

    Entries without a ``body`` key are ignored. Replies are recorded before
    the comment itself, each with the parent's author as ``reply_to``.
    A row is skipped when either the author or the person replied to is
    ``'[deleted]'``.

    Args:
        comment: The ``data`` dict of a comment. Its ``replies`` value is
            expected to be a listing dict shaped like
            ``{'data': {'children': [{'data': {...}}, ...]}}``; any falsy or
            non-dict value is treated as "no replies".
        reply_to: Author name this comment answers, or ``None``.
        rows: List that receives ``[author, reply_to, body]`` rows. Defaults
            to the module-level ``DATA`` list.

    Returns:
        None. Results are appended to ``rows``.
    """
    if 'body' not in comment:
        return
    if rows is None:
        rows = DATA

    author = comment.get('author', 'no-author')

    # The children live inside the ``replies`` listing, not on the comment
    # itself; looking them up on the comment always yields nothing.
    replies = comment.get('replies')
    if isinstance(replies, dict):
        children = replies.get('data', {}).get('children', [])
    else:
        children = []

    for reply in children:
        parse_comment(reply['data'], author, rows)

    if author != '[deleted]' and reply_to != '[deleted]':
        rows.append([author, reply_to, comment.get('body', None)])
def parse_post(post):
    """Fetch a post's comment page and record every comment found on it.

    Top-level comments are recorded as replies to the post's author.

    Args:
        post: Post data dict; its ``permalink`` and ``author`` keys are read.

    Returns:
        None. Comments are recorded through ``parse_comment``.
    """
    content = get('https://www.reddit.com' + post['permalink'] + '.json')
    # The second element of the response holds the comment listing.
    for comment in content[1]['data']['children']:
        parse_comment(comment['data'], post['author'])
# Walk the all-time top listing of r/django page by page, recording the
# comments of each post, then dump everything collected to reddit.csv.
count = 0
last_post_id = ''
while True:
    resp = get('https://www.reddit.com/r/django/top/.json', {
        'count': count,
        'after': last_post_id,
        'sort': 'top',
        't': 'all',
    })
    posts = resp['data']['children']
    for post in posts:
        post = post['data']
        print(post['title'])
        parse_post(post)
        count += 1
    # ``after`` is the cursor for the next page of the listing.
    last_post_id = resp['data']['after']
    print('COUNT:', count, last_post_id)
    print(len(DATA))
    # Stop once enough posts were processed, and also when the listing is
    # exhausted (empty page or no cursor); otherwise the next request would
    # start again from the top and the loop would never end.
    if count > 100 or not posts or not last_post_id:
        break

# utf-8 keeps non-ASCII comment text from raising UnicodeEncodeError on
# platforms whose default encoding is narrower; newline='' is what the csv
# module requires to avoid doubled line endings; ``with`` closes the file.
with open('reddit.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows(DATA)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
On my first try I got an error:
UnicodeEncodeError: 'charmap' codec can't encode characters in position 25-26: character maps to <undefined>
Fix: see line 34 and add encoding='utf-8'.