Skip to content

Instantly share code, notes, and snippets.

@mdamien
Last active July 27, 2019 06:46
Show Gist options
  • Save mdamien/36d05bc75423bc2c88ac1eb24b8a11f2 to your computer and use it in GitHub Desktop.
Save mdamien/36d05bc75423bc2c88ac1eb24b8a11f2 to your computer and use it in GitHub Desktop.
parse_reddit.py
import requests, csv
# optional: requests-cache makes it easy to cache your results
# import requests_cache
# requests_cache.install_cache('reddit_cache')
# Where we store all the rows: parse_comment appends one
# [author, reply_to, body] list per kept comment, and the script writes
# them out to reddit.csv once crawling is finished.
DATA = []
def get(url, args=None):
    """Fetch *url* with a GET request and return the decoded JSON body.

    Args:
        url: Address to request.
        args: Optional mapping of query-string parameters. Defaults to no
            parameters.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        ValueError: If the response body is not valid JSON. The raw body is
            printed first to help diagnose the failure.
        Exception: Whatever ``requests.get`` raises on a transport failure
            propagates unchanged.
    """
    # A fresh dict per call: a ``{}`` default would be shared between calls.
    params = {} if args is None else args
    print('GET', url, params)

    # Kept outside the ``try`` so a transport failure is not masked by an
    # UnboundLocalError when the handler tries to print ``resp.text``.
    resp = requests.get(url, params, headers={'User-agent': 'linkage.fr'})
    try:
        return resp.json()
    except ValueError:
        # JSON decoding errors are ValueError subclasses; show what came back.
        print(resp.text)
        raise
def parse_comment(comment, reply_to=None):
    """Record a comment and, recursively, its replies as rows in DATA.

    Each kept comment becomes one ``[author, reply_to, body]`` row. Replies
    are processed before their parent, so their rows are appended first.

    Args:
        comment: The ``data`` dict of a comment object. Dicts without a
            ``'body'`` key (for example "load more" placeholders) are ignored.
        reply_to: Author of the item this comment answers, or None.

    Returns:
        None. Results are appended to the module-level DATA list.
    """
    if 'body' not in comment:
        return

    author = comment.get('author', 'no-author')

    # Reddit sends ``replies`` as an empty string when there are none and as
    # a Listing object ({'data': {'children': [...]}}) otherwise.
    replies = comment.get('replies')
    if isinstance(replies, dict):
        children = replies.get('data', {}).get('children') or []
    elif replies:
        # Fallback kept for callers that pass a flat 'children' list directly
        # on the comment, which is the only shape the previous code read.
        children = comment.get('children') or []
    else:
        children = []

    for reply in children:
        parse_comment(reply['data'], author)

    # Skip rows where either end of the exchange has been deleted.
    if author != '[deleted]' and reply_to != '[deleted]':
        DATA.append([author, reply_to, comment.get('body', None)])
def parse_post(post):
    """Download a post's comment page and feed each top-level comment to
    parse_comment, using the post's author as the ``reply_to`` value.

    Args:
        post: The ``data`` dict of a post taken from a listing; its
            ``'permalink'`` and ``'author'`` keys are read.

    Returns:
        None.
    """
    content = get('https://www.reddit.com' + post['permalink'] + '.json')
    # The second element of the response holds the comment listing.
    for comment in content[1]['data']['children']:
        parse_comment(comment['data'], post['author'])
# Walk the all-time top listing of r/django page by page, collecting the
# comments of every post, then dump the collected rows to reddit.csv.
count = 0  # posts processed so far; also sent as the 'count' query parameter
last_post_id = ''  # the listing's 'after' value, echoed back to get the next page
while True:
    resp = get('https://www.reddit.com/r/django/top/.json', {
        'count': count,
        'after': last_post_id,
        'sort': 'top',
        't': 'all'
    })
    posts = resp['data']['children']
    for post in posts:
        post = post['data']
        print(post['title'])
        parse_post(post)
        count += 1
    last_post_id = resp['data']['after']
    print('COUNT:', count, last_post_id)
    print(len(DATA))
    # Stop once enough posts were seen, and also when the listing runs out:
    # an empty page would otherwise loop forever, and a missing 'after'
    # value would restart from the first page and duplicate rows.
    if count > 100 or not posts or not last_post_id:
        break

# newline='' is what the csv module expects for files it writes; an explicit
# UTF-8 encoding avoids UnicodeEncodeError on platforms whose default codec
# cannot represent every character found in comments.
with open('reddit.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows(DATA)
@alainfa
Copy link

alainfa commented Jul 27, 2019

First trial, an error:
UnicodeEncodeError: 'charmap' codec can't encode characters in position 25-26: character maps to <undefined>
cf line 34, add encoding utf-8

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment