Skip to content

Instantly share code, notes, and snippets.

@Romern
Created January 30, 2021 16:26
Show Gist options
  • Save Romern/7f66a0fa2f84f2f1dcea48949ba413d1 to your computer and use it in GitHub Desktop.
Scrape all submissions of a subreddit via pushshift.io
import sys
import json
import requests
import traceback
if len(sys.argv) < 4:
    # Three positional arguments are required: subreddit name, start
    # timestamp, and output path.
    print("Scrape whole subreddit's submissions.")
    print(f"Usage: {sys.argv[0]} SUBREDDIT BEFORETIMESTAMP OUTPUTFILE")
    sys.exit()

SUBREDDIT = sys.argv[1]
BEFORE = sys.argv[2]  # epoch timestamp; moves backwards as pages are fetched
SINK = sys.argv[3]    # output file: one JSON object per line (JSON Lines)
RETRY = 5             # maximum request attempts per page

i = 0  # page counter, for progress reporting only
with open(SINK, "a") as sink:
    while True:
        params = {
            "subreddit": SUBREDDIT,
            "before": BEFORE,
        }
        # Fetch one page, retrying up to RETRY attempts total on any
        # non-2xx response. (The original loop issued RETRY+1 requests
        # because it tested the counter only after firing the request.)
        scrape = None
        for attempt in range(RETRY):
            if attempt > 0:
                print("Got an error, retrying...")
            scrape = requests.get(
                'https://api.pushshift.io/reddit/submission/search',
                params=params,
            )
            if scrape.ok:
                break
        if scrape is None or not scrape.ok:
            print(f"FAILED: {scrape.text if scrape is not None else 'no response'}")
            break
        cur_data = scrape.json()['data']
        if not cur_data:
            # Empty page means there is nothing older than BEFORE:
            # clean end of the crawl (the original crashed here with
            # IndexError on cur_data[-1]).
            print(f"{i}: Done, no more submissions before {BEFORE}")
            break
        for s in cur_data:
            sink.write(f"{json.dumps(s)}\n")
        # Continue paging backwards from the oldest submission received.
        BEFORE = cur_data[-1]["created_utc"]
        i += 1
        print(f"{i}: Crawled until {BEFORE}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment