# Adapted from https://www.osrsbox.com/blog/2019/03/18/watercooler-scraping-an-entire-subreddit-2007scape/
import requests
import json
import time
import os

SUBREDDIT = "movies"
PUSHSHIFT_REDDIT_URL = "http://api.pushshift.io/reddit"
DATA_FOLDER = "./r-" + SUBREDDIT
AFTER_EPOCH = 1546300800  # 01/01/2019 @ 12:00am (UTC), see https://www.epochconverter.com/
def fetchObjects(**kwargs):
    # Default parameters for the API query
    params = {
        "sort_type": "created_utc",
        "sort": "asc",
        "size": 1000
    }
    # Add additional parameters based on function arguments
    for key, value in kwargs.items():
        params[key] = value
    # Set the object type based on function input
    # The type can be "comment" or "submission"; default is "comment"
    object_type = "comment"
    if 'type' in kwargs and kwargs['type'].lower() == "submission":
        object_type = "submission"
    # Perform an API request
    r = requests.get(PUSHSHIFT_REDDIT_URL + "/" + object_type + "/search/", params=params, timeout=30)
    # If the request succeeded, sort the results by ID and return them
    # (Reddit IDs are base-36 strings, so decode them for a numeric sort)
    if r.status_code == 200:
        data = r.json()['data']
        return sorted(data, key=lambda x: int(x['id'], 36))
    # On failure, return an empty list so the caller doesn't crash iterating None
    return []
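
# Usage sketch (an illustration added here, not part of the original script):
# fetch one page of r/movies submissions created after AFTER_EPOCH, oldest
# first. Each element is a dict whose 'id' (a base-36 string) and
# 'created_utc' fields drive the pagination loop below.
#
#   page = fetchObjects(subreddit=SUBREDDIT, type="submission", after=AFTER_EPOCH)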
def extract_reddit_data(**kwargs):
    # Specify the start timestamp
    max_created_utc = AFTER_EPOCH
    max_id = 0
    # Open a file for newline-delimited JSON output (one object per line)
    with open(os.path.join(DATA_FOLDER, "submissions.json"), "a") as file:
        # Page through the API until a query returns nothing new
        while True:
            nothing_processed = True
            # Fetch the next batch, starting from the newest timestamp seen so far
            objects = fetchObjects(**kwargs, after=max_created_utc)
            # Loop over the returned data, ordered by ID
            for obj in objects:
                obj_id = int(obj['id'], 36)
                # Skip anything already written (IDs only increase over time)
                if obj_id > max_id:
                    nothing_processed = False
                    created_utc = obj['created_utc']
                    max_id = obj_id
                    if created_utc > max_created_utc:
                        max_created_utc = created_utc
                    # Append the object to the file as a single JSON line
                    print(json.dumps(obj, sort_keys=True, ensure_ascii=True), file=file)
            # Exit once a full page yields nothing new
            if nothing_processed:
                return
            # Back up one second so objects sharing the newest timestamp aren't
            # skipped; the max_id check above deduplicates any repeats
            max_created_utc -= 1
            # Sleep a little between requests to be polite to the API
            time.sleep(.5)

os.makedirs(DATA_FOLDER, exist_ok=True)
extract_reddit_data(subreddit=SUBREDDIT, type="submission")
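
# Quick sanity check (a minimal sketch added for illustration, assuming the
# default DATA_FOLDER above): the output file is newline-delimited JSON, one
# object per line, so it can be read back with json.loads on each line.
with open(os.path.join(DATA_FOLDER, "submissions.json")) as f:
    submissions = [json.loads(line) for line in f]
print("Loaded", len(submissions), "submissions from r/" + SUBREDDIT)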