@m-sean
Last active February 1, 2022 07:15
Subreddit scraper using the Pushshift API.
"""Scrape a specified subreddit for comments using the Pushshift API
(writes JSON objects to disk)"""
import json
import requests
from tqdm import tqdm
from nltk import defaultdict
from time import sleep
SUB = "AskReddit" # subreddit to scrape
START = 0 # lower bound for scrape (0 = today, 1 = yesterday, etc.)
INC = 1 # increment for scrape (1 = 1 day per scrape)
STOP = 14 # upper bound for scrape (14 = 2 weeks of data total)
NUM = 500 # total number of comments per scrape
SINK = "reddit-data.json"
def main():
# define an iterator using Pushift API for the scrape
def yield_request(sub, start, stop, inc, num):
while start <= stop:
try:
r = requests.get(f'https://api.pushshift.io/reddit/search/comment/?subreddit={sub}&size={num}&before={start}d&after={start+inc}d')
yield r.json()
start += inc
except json.decoder.JSONDecodeError:
sleep(10)
# collect the comments
data = []
for scrape in tqdm(yield_request(SUB, START, STOP, INC, NUM)):
print(len(scrape['data']))
data.extend(scrape['data'])
print(f"{len(data)} comments collected")
#write to disk
with open(SINK, "w") as sink:
json.dump(data, sink)
print(f"Comments written to {SINK}")
if __name__ == "__main__":
main()
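
For a quick sanity check after a run, you can load the file back and inspect a record (a minimal sketch; body is an assumption about the fields Pushshift returns for each comment):

import json

with open("reddit-data.json") as src:
    data = json.load(src)

print(f"{len(data)} comments on disk")
print(data[0].get("body", "")[:80])  # preview of the first comment's text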
@olastor commented Jun 4, 2021

Thanks for publishing your script! One question: Why does STOP=14 mean one week and not two?

@m-sean (Author) commented Jun 4, 2021

@olastor, thanks for catching that. It was an error in the comment; the scrape does in fact cover 2 weeks.

@Anil-gitub

How can I get this data into a CSV file?
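
One way to do this (a minimal sketch, not part of the original script): flatten each comment object to a fixed set of columns with Python's csv module. The column names below (author, created_utc, body) are assumptions about the Pushshift comment schema; adjust them to the keys actually present in your data.

import csv
import json

# columns to pull from each comment object (assumed field names)
FIELDS = ["author", "created_utc", "body"]

with open("reddit-data.json") as src:
    data = json.load(src)

with open("reddit-data.csv", "w", newline="") as sink:
    writer = csv.DictWriter(sink, fieldnames=FIELDS)
    writer.writeheader()
    for comment in data:
        # .get() tolerates records that are missing a field
        writer.writerow({field: comment.get(field, "") for field in FIELDS})

This reads the reddit-data.json file the script writes and produces reddit-data.csv alongside it.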
