#!/usr/bin/env python3
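"""Fetch the top posts from a subreddit via Reddit's public top.json
endpoint, cache the response on disk for five minutes, and print a
summary of each post. Requests are rate-limited with pyrate_limiter."""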
import sys
import time
from pathlib import Path

import requests
from pyrate_limiter import Duration, Limiter, RequestRate

# Prefer ujson for speed when it is installed; fall back to the stdlib.
try:
    import ujson as json
except ImportError:
    import json
HERE = Path(__file__).parent
CACHE_DIR = HERE / "_cache"
CACHE_MAX_AGE = 5 * 60  # cache lasts 5 minutes
USERAGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
# Allow at most 5 requests per minute; with delay=True a rate-limited call
# waits up to max_delay seconds for a free slot before raising.
limiter = Limiter(RequestRate(5, Duration.MINUTE))

@limiter.ratelimit('top', delay=True, max_delay=10)
def get_top_json(sub, limit, when, session=None):
    # Reuse a caller-supplied session if given; always send a browser
    # User-Agent so reddit.com does not reject the request.
    if session is None:
        session = requests.Session()
    session.headers['User-Agent'] = USERAGENT
    url = f"https://reddit.com/r/{sub}/top.json?t={when}&limit={limit}"
    print(f"Getting {url}")
    r = session.get(url)
    return r
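# Example call (mirroring the CLI example below); returns a requests.Response
# whose .text is the raw JSON listing:
#   r = get_top_json("thenewboston", 10, "week")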
def help_usage():
    return (f'usage: {Path(__file__).name} {{sub}} {{limit}} {{when}}\n\n'
            '  sub    the subreddit to scrape\n'
            '  limit  number of posts to gather\n'
            '  when   "hour", "day", "week", "month", "year", "all"\n\n'
            f'Example: {Path(__file__).name} thenewboston 10 week')
def overwrite_file(fp, text):
    # Mode 'w' truncates the file on open, so a single write replaces
    # any previous contents.
    with open(fp, 'w', encoding='utf-8') as f:
        f.write(text)
    return True
def main():
    if len(sys.argv) == 1:
        print('Missing arguments')
        print(help_usage())
        sys.exit(1)
    args = sys.argv[1:]
    try:
        sub = args[0]
        limit = args[1]
        when = args[2]
    except IndexError:
        print('Missing arguments')
        print(help_usage())
        sys.exit(1)
    # check cache exists
    if not CACHE_DIR.exists():
        CACHE_DIR.mkdir(parents=True)

    # check if cache is recent
    curr_time = time.time()
    json_fp = CACHE_DIR / f'{sub}-top-{limit}-{when}.json'
    if json_fp.exists():
        ft = json_fp.stat().st_mtime
        if curr_time - ft >= CACHE_MAX_AGE:
            print("Cache is old")
            cache_valid = False
        else:
            print(f"Using cached result: {json_fp}")
            cache_valid = True
    else:
        print("No cache found")
        cache_valid = False
    if not cache_valid:
        r = get_top_json(sub, limit, when)
        if r.status_code != 200:
            print(f"Failed to download JSON. Got status code {r.status_code}")
            sys.exit(1)
        print("Downloaded!")
        overwrite_file(json_fp, r.text)
        data = r.text
    else:
        with open(json_fp, encoding='utf-8') as f:
            data = f.read()
    data = json.loads(data)
    print()
    posts = []
    # Reddit's listing JSON nests each post under data -> children -> data.
    for post in data['data']['children']:
        post = post['data']
        title = post['title']
        author = post['author']
        ups = post['ups']
        created_utc = post['created_utc']
        posts.append({
            'title': title,
            'author': author,
            'upvotes': ups,
            'created': created_utc
        })
        print(f"Title: {title}")
        print(f"Author: {author}")
        print(f"Karma: {ups}")
        print(f"Created UTC: {created_utc}")
        print()
if __name__ == "__main__":
    main()
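# Example invocation, from the usage text above (the script filename here
# is a placeholder):
#   python reddit_top.py thenewboston 10 week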