#!/usr/bin/env python3
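"""Fetch the top posts from a subreddit via Reddit's public top.json
endpoint, cache the response on disk for five minutes, and print a
summary of each post. Requests are rate-limited with pyrate_limiter."""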
import sys
import time
from pathlib import Path

import requests
from pyrate_limiter import Duration, Limiter, RequestRate

# Prefer ujson for speed when it is installed; fall back to the stdlib.
try:
    import ujson as json
except ImportError:
    import json
HERE = Path(__file__).parent
CACHE_DIR = HERE / "_cache"
CACHE_MAX_AGE = 5 * 60  # cache lasts 5 minutes
USERAGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
# Allow at most 5 requests per minute; with delay=True a rate-limited call
# waits up to max_delay seconds for a free slot before raising.
limiter = Limiter(RequestRate(5, Duration.MINUTE))

@limiter.ratelimit('top', delay=True, max_delay=10)
def get_top_json(sub, limit, when, session=None):
    # Reuse a caller-supplied session if given; always send a browser
    # User-Agent so reddit.com does not reject the request.
    if session is None:
        session = requests.Session()
    session.headers['User-Agent'] = USERAGENT
    url = f"https://reddit.com/r/{sub}/top.json?t={when}&limit={limit}"
    print(f"Getting {url}")
    r = session.get(url)
    return r
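# Example call (mirroring the CLI example below); returns a requests.Response
# whose .text is the raw JSON listing:
#   r = get_top_json("thenewboston", 10, "week")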
def help_usage():
    return (f'usage: {Path(__file__).name} {{sub}} {{limit}} {{when}}\n\n'
            '  sub    the subreddit to scrape\n'
            '  limit  number of posts to gather\n'
            '  when   "hour", "day", "week", "month", "year", "all"\n\n'
            f'Example: {Path(__file__).name} thenewboston 10 week')
def overwrite_file(fp, text):
    # Mode 'w' truncates the file on open, so a single write replaces
    # any previous contents.
    with open(fp, 'w', encoding='utf-8') as f:
        f.write(text)
    return True
def main():
    if len(sys.argv) == 1:
        print('Missing arguments')
        print(help_usage())
        sys.exit(1)
    args = sys.argv[1:]
    try:
        sub = args[0]
        limit = args[1]
        when = args[2]
    except IndexError:
        print('Missing arguments')
        print(help_usage())
        sys.exit(1)
    # check cache exists
    if not CACHE_DIR.exists():
        CACHE_DIR.mkdir(parents=True)

    # check if cache is recent
    curr_time = time.time()
    json_fp = CACHE_DIR / f'{sub}-top-{limit}-{when}.json'
    if json_fp.exists():
        ft = json_fp.stat().st_mtime
        if curr_time - ft >= CACHE_MAX_AGE:
            print("Cache is old")
            cache_valid = False
        else:
            print(f"Using cached result: {json_fp}")
            cache_valid = True
    else:
        print("No cache found")
        cache_valid = False
    if not cache_valid:
        r = get_top_json(sub, limit, when)
        if r.status_code != 200:
            print(f"Failed to download JSON. Got status code {r.status_code}")
            sys.exit(1)
        print("Downloaded!")
        overwrite_file(json_fp, r.text)
        data = r.text
    else:
        with open(json_fp, encoding='utf-8') as f:
            data = f.read()
    data = json.loads(data)
    print()
    posts = []
    # Reddit's listing JSON nests each post under data -> children -> data.
    for post in data['data']['children']:
        post = post['data']
        title = post['title']
        author = post['author']
        ups = post['ups']
        created_utc = post['created_utc']
        posts.append({
            'title': title,
            'author': author,
            'upvotes': ups,
            'created': created_utc
        })
        print(f"Title: {title}")
        print(f"Author: {author}")
        print(f"Karma: {ups}")
        print(f"Created UTC: {created_utc}")
        print()
if __name__ == "__main__":
    main()
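# Example invocation, from the usage text above (the script filename here
# is a placeholder):
#   python reddit_top.py thenewboston 10 week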