Created
January 30, 2021 16:26
-
-
Save Romern/7f66a0fa2f84f2f1dcea48949ba413d1 to your computer and use it in GitHub Desktop.
Scrape all submissions of a subreddit via pushshift.io
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys
import json
import requests
import traceback  # NOTE(review): unused in the visible code; kept since the file may be partial

# Maximum number of extra attempts per pushshift request (RETRY + 1 requests total,
# matching the original retry loop's attempt count).
RETRY = 5


def main():
    """Scrape all submissions of a subreddit via the pushshift.io API.

    Usage: script SUBREDDIT BEFORETIMESTAMP OUTPUTFILE

    Pages backwards in time starting at BEFORETIMESTAMP, appending one JSON
    object per submission (JSON-lines) to OUTPUTFILE. Stops when the API
    fails repeatedly or no more submissions are returned.
    """
    if len(sys.argv) < 4:
        print("Scrape whole subreddit's submissions.")
        print(f"Usage: {sys.argv[0]} SUBREDDIT BEFORETIMESTAMP OUTPUTFILE")
        sys.exit()  # was bare exit(); sys.exit is the reliable form in scripts

    subreddit = sys.argv[1]
    before = sys.argv[2]
    sink_path = sys.argv[3]

    batch = 0
    # Append mode so an interrupted crawl can be resumed without losing data.
    with open(sink_path, "a") as sink:
        while True:
            params = {
                "subreddit": subreddit,
                "before": before,
            }

            # Retry transient API failures; up to RETRY + 1 requests in total.
            scrape = None
            for attempt in range(RETRY + 1):
                if attempt > 0:
                    print("Got an error, retrying...")
                scrape = requests.get(
                    "https://api.pushshift.io/reddit/submission/search",
                    params=params,
                )
                if scrape.ok:
                    break

            if not scrape.ok:
                print(f"FAILED: {scrape.text}")
                break

            cur_data = scrape.json()["data"]
            # BUG FIX: the original indexed cur_data[-1] unconditionally, which
            # raised IndexError once the subreddit was exhausted (empty "data").
            # Treat an empty page as the normal end of the crawl.
            if not cur_data:
                print("No more submissions; done.")
                break

            for submission in cur_data:
                sink.write(f"{json.dumps(submission)}\n")

            # Page backwards: the next request asks for submissions older than
            # the oldest one just received.
            before = cur_data[-1]["created_utc"]
            batch += 1
            print(f"{batch}: Crawled until {before}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment