Pull URLs posted to a list of Reddit threads for archival purposes. Made for archiving media posted to a couple of threads after the January 6 insurrection attempt.
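A quick note on running it (a sketch of the setup, assuming the script is saved as pull_urls.py; any filename works): it needs the requests library and the you-get downloader, both available from PyPI. The collected URL list and downloaded media are written under data/.

pip install requests you-get
python pull_urls.py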
import requests, re, time, itertools, datetime, os, subprocess
from random import randint
redditThreads = [
    "https://www.reddit.com/r/AccidentalRenaissance/comments/kryhzt/us_capitol_protests_megathread_please_post_all/",
    "https://www.reddit.com/r/DataHoarder/comments/krx449/megathread_archiving_the_capitol_hill_riots/",
    "https://www.reddit.com/r/news/comments/krvwkf/megathread_protrump_protesters_storm_us_capitol/",
    "https://www.reddit.com/r/politics/comments/kryi79/megathread_us_capitol_locked_down_as_trump/",
    "https://www.reddit.com/r/PublicFreakout/comments/khs5k2/happening_now_trump_supporters_trying_to_destroy/",
    "https://www.reddit.com/r/news/comments/krue9q/capitol_police_order_evacuation_of_some_capitol/",
    "https://www.reddit.com/r/Conservative/comments/krxl6t/for_those_of_you_comparing_these_protests_to/",
    "https://www.reddit.com/r/PublicFreakout/comments/krx7yw/the_police_opened_the_gates_for_capitol_rioters/",
    "https://www.reddit.com/r/news/comments/krzopk/megathread_part_2_trump_supporters_storm_us/",
    "https://www.reddit.com/r/stupidpol/comments/kruuvf/trump_fedayeen_group_sperging_out_and_rioting_at/"
]
reddituser = 'dcalacci'
data = []
user_agent = 'Python.Find_links' + ':v0.1 (by /u/' + reddituser + ')'
def pullurl(s):
    # general-purpose URL regex: captures scheme, domain, and path
    rx = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
    return re.findall(rx, s)
def parseUrls(s):
    urls = pullurl(s)
    urlStrings = []
    for u in urls:
        # the regex splits each match into scheme, domain, and path
        urlStrings.append("{}://{}{}".format(u[0], u[1], u[2]))
    # deduplicate
    return list(set(urlStrings))
def find(key, dictionary):
    # recursively yield every value stored under `key` anywhere in a
    # nested structure of dicts and lists
    if not isinstance(dictionary, dict):
        return
    for k, v in dictionary.items():
        if k == key:
            yield v
        elif isinstance(v, dict):
            for result in find(key, v):
                yield result
        elif isinstance(v, list):
            for d in v:
                for result in find(key, d):
                    yield result
for thread in redditThreads:
    print("processing thread {}".format(thread))
    headers = {'user-agent': user_agent}
    res = requests.get(thread + 'comments.json', headers=headers)
    res = res.json()
    # a thread's .json response is a list of listings
    for listing in res:
        output = listing['data']['children']
        # dig the comment bodies out of the nested listing structure
        bodies = [list(find('body', o)) for o in output]
        data = data + bodies
    # wait between 10 and 15 seconds to prevent reddit from blocking the script
    time.sleep(10 + randint(0, 5))
data = list(itertools.chain(*data))
allUrls = list(itertools.chain(*[parseUrls(d) for d in data]))
print('found {} total URLs. Pulling media...'.format(len(allUrls)))
curtime = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H:%M:%S')
if not os.path.exists('data'):
    os.mkdir('data')
urlfile = "data/allurls_{}.txt".format(curtime)
# write parsed urls to file
with open(urlfile, 'w') as f:
    for url in allUrls:
        f.write(url + "\n")
print("parsed URLs from pages. Downloading media...") | |
# use you-get to download all the media we can | |
if not os.path.exists('data/media'): | |
os.mkdir("data/media") | |
with open(urlfile, 'r') as f: | |
urls = [l.strip() for l in f.readlines()] | |
for u in urls: | |
subprocess.run(['you-get', '-a', '-o', 'data/media', u]) |
You might be able to do this more easily with PRAW, a Python package that interfaces with Reddit's API. I've never used it myself, but I've heard of people using it for tasks like this.
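For what it's worth, here's a minimal sketch of that approach, reusing redditThreads and parseUrls from the script above. It's untested, and the client_id and client_secret values are placeholders you'd get by registering a script-type app under your Reddit account preferences:

import praw

# placeholder credentials: register a script-type app on your
# Reddit account to get real values
reddit = praw.Reddit(client_id="YOUR_CLIENT_ID",
                     client_secret="YOUR_CLIENT_SECRET",
                     user_agent="Python.Find_links:v0.1 (by /u/dcalacci)")

allUrls = []
for thread in redditThreads:
    submission = reddit.submission(url=thread)
    # expand every "load more comments" stub so the full tree is visible
    submission.comments.replace_more(limit=None)
    for comment in submission.comments.list():
        allUrls.extend(parseUrls(comment.body))

PRAW also respects Reddit's rate limits on its own, so the manual time.sleep() between threads wouldn't be needed.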