Subreddit scraper using the Pushshift API.
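The script builds every request from Pushshift's comment-search endpoint. As a quick orientation, here is a minimal sketch of a single call, assuming the endpoint and its subreddit, size, before, and after parameters still behave as they did when this gist was written:

import requests

# fetch up to 5 comments from r/AskReddit posted between 2 days ago and 1 day ago
resp = requests.get(
    "https://api.pushshift.io/reddit/search/comment/",
    params={"subreddit": "AskReddit", "size": 5, "before": "1d", "after": "2d"},
)
print(resp.json()["data"])  # a list of comment objects (dicts)

The full script below simply walks this one-day window further into the past on each iteration.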
"""Scrape a specified subreddit for comments using the Pushshift API | |
(writes JSON objects to disk)""" | |
import json | |
import requests | |
from tqdm import tqdm | |
from nltk import defaultdict | |
from time import sleep | |
SUB = "AskReddit" # subreddit to scrape | |
START = 0 # lower bound for scrape (0 = today, 1 = yesterday, etc.) | |
INC = 1 # increment for scrape (1 = 1 day per scrape) | |
STOP = 14 # upper bound for scrape (14 = 2 weeks of data total) | |
NUM = 500 # total number of comments per scrape | |
SINK = "reddit-data.json" | |
def main(): | |
# define an iterator using Pushift API for the scrape | |
def yield_request(sub, start, stop, inc, num): | |
while start <= stop: | |
try: | |
r = requests.get(f'https://api.pushshift.io/reddit/search/comment/?subreddit={sub}&size={num}&before={start}d&after={start+inc}d') | |
yield r.json() | |
start += inc | |
except json.decoder.JSONDecodeError: | |
sleep(10) | |
# collect the comments | |
data = [] | |
for scrape in tqdm(yield_request(SUB, START, STOP, INC, NUM)): | |
print(len(scrape['data'])) | |
data.extend(scrape['data']) | |
print(f"{len(data)} comments collected") | |
#write to disk | |
with open(SINK, "w") as sink: | |
json.dump(data, sink) | |
print(f"Comments written to {SINK}") | |
if __name__ == "__main__": | |
main() | |
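After the scrape, SINK holds a single JSON array of comment objects. A minimal sketch for loading the file back and tallying comments per calendar day, assuming each object carries the standard Pushshift created_utc field:

import json
from collections import Counter
from datetime import datetime, timezone

with open("reddit-data.json") as source:
    comments = json.load(source)

# bucket comments by the UTC date of their created_utc timestamp
per_day = Counter(
    datetime.fromtimestamp(comment["created_utc"], tz=timezone.utc).date()
    for comment in comments
)
for day, count in sorted(per_day.items()):
    print(day, count)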
Thanks for publishing your script! One question: Why does STOP=14 mean one week and not two?