Pull URLs posted to a list of Reddit threads for archival purposes. Made for archiving media posted to a couple of threads after the January 6 insurrection attempt.
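A quick note on running it (a sketch of the setup, assuming the script is saved as pull_urls.py; any filename works): it needs the requests library and the you-get downloader, both available from PyPI. The collected URL list and downloaded media are written under data/.

pip install requests you-get
python pull_urls.py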
import requests, re, time, itertools, datetime, os, subprocess
from random import randint
redditThreads = [
    "https://www.reddit.com/r/AccidentalRenaissance/comments/kryhzt/us_capitol_protests_megathread_please_post_all/",
    "https://www.reddit.com/r/DataHoarder/comments/krx449/megathread_archiving_the_capitol_hill_riots/",
    "https://www.reddit.com/r/news/comments/krvwkf/megathread_protrump_protesters_storm_us_capitol/",
    "https://www.reddit.com/r/politics/comments/kryi79/megathread_us_capitol_locked_down_as_trump/",
    "https://www.reddit.com/r/PublicFreakout/comments/khs5k2/happening_now_trump_supporters_trying_to_destroy/",
    "https://www.reddit.com/r/news/comments/krue9q/capitol_police_order_evacuation_of_some_capitol/",
    "https://www.reddit.com/r/Conservative/comments/krxl6t/for_those_of_you_comparing_these_protests_to/",
    "https://www.reddit.com/r/PublicFreakout/comments/krx7yw/the_police_opened_the_gates_for_capitol_rioters/",
    "https://www.reddit.com/r/news/comments/krzopk/megathread_part_2_trump_supporters_storm_us/",
    "https://www.reddit.com/r/stupidpol/comments/kruuvf/trump_fedayeen_group_sperging_out_and_rioting_at/"
]
reddituser = 'dcalacci'
data = []
user_agent = 'Python.Find_links' + ':v0.1 (by /u/' + reddituser + ')'
def pullurl(s):
    # general-purpose URL regex: captures scheme, domain, and path
    rx = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
    return re.findall(rx, s)
def parseUrls(s):
    urls = pullurl(s)
    urlStrings = []
    for u in urls:
        # the regex splits each match into scheme, domain, and path
        urlStrings.append("{}://{}{}".format(u[0], u[1], u[2]))
    # deduplicate
    return list(set(urlStrings))
def find(key, dictionary):
    # recursively yield every value stored under `key` anywhere in a
    # nested structure of dicts and lists
    if not isinstance(dictionary, dict):
        return
    for k, v in dictionary.items():
        if k == key:
            yield v
        elif isinstance(v, dict):
            for result in find(key, v):
                yield result
        elif isinstance(v, list):
            for d in v:
                for result in find(key, d):
                    yield result
for thread in redditThreads:
    print("processing thread {}".format(thread))
    headers = {'user-agent': user_agent}
    res = requests.get(thread + 'comments.json', headers=headers)
    res = res.json()
    # a thread's .json response is a list of listings
    for listing in res:
        output = listing['data']['children']
        # dig the comment bodies out of the nested listing structure
        bodies = [list(find('body', o)) for o in output]
        data = data + bodies
    # wait between 10 and 15 seconds to prevent reddit from blocking the script
    time.sleep(10 + randint(0, 5))
data = list(itertools.chain(*data))
allUrls = list(itertools.chain(*[parseUrls(d) for d in data]))
print('found {} total URLs. Pulling media...'.format(len(allUrls)))
curtime = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H:%M:%S')
if not os.path.exists('data'):
    os.mkdir('data')
urlfile = "data/allurls_{}.txt".format(curtime)
# write parsed urls to file
with open(urlfile, 'w') as f:
    for url in allUrls:
        f.write(url + "\n")
print("parsed URLs from pages. Downloading media...") | |
# use you-get to download all the media we can | |
if not os.path.exists('data/media'): | |
os.mkdir("data/media") | |
with open(urlfile, 'r') as f: | |
urls = [l.strip() for l in f.readlines()] | |
for u in urls: | |
subprocess.run(['you-get', '-a', '-o', 'data/media', u]) |
You might be able to do this more easily with PRAW, a Python package that interfaces with Reddit's API. I've never used it myself, but I've heard of people using it for tasks like this.
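For what it's worth, here's a minimal sketch of that approach, reusing redditThreads and parseUrls from the script above. It's untested, and the client_id and client_secret values are placeholders you'd get by registering a script-type app under your Reddit account preferences:

import praw

# placeholder credentials: register a script-type app on your
# Reddit account to get real values
reddit = praw.Reddit(client_id="YOUR_CLIENT_ID",
                     client_secret="YOUR_CLIENT_SECRET",
                     user_agent="Python.Find_links:v0.1 (by /u/dcalacci)")

allUrls = []
for thread in redditThreads:
    submission = reddit.submission(url=thread)
    # expand every "load more comments" stub so the full tree is visible
    submission.comments.replace_more(limit=None)
    for comment in submission.comments.list():
        allUrls.extend(parseUrls(comment.body))

PRAW also respects Reddit's rate limits on its own, so the manual time.sleep() between threads wouldn't be needed.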