
@dcalacci
Last active January 8, 2021 19:43
pull URLs posted to a list of reddit threads for archival purposes. made for archiving media posted to a couple threads after the 1/6 insurrection attempt
import requests, json, re, time, itertools, datetime, sys, os, subprocess
from random import randint
redditThreads = [
"https://www.reddit.com/r/AccidentalRenaissance/comments/kryhzt/us_capitol_protests_megathread_please_post_all/",
"https://www.reddit.com/r/DataHoarder/comments/krx449/megathread_archiving_the_capitol_hill_riots/",
"https://www.reddit.com/r/news/comments/krvwkf/megathread_protrump_protesters_storm_us_capitol/",
"https://www.reddit.com/r/politics/comments/kryi79/megathread_us_capitol_locked_down_as_trump/",
"https://www.reddit.com/r/PublicFreakout/comments/khs5k2/happening_now_trump_supporters_trying_to_destroy/",
"https://www.reddit.com/r/news/comments/krue9q/capitol_police_order_evacuation_of_some_capitol/",
"https://www.reddit.com/r/Conservative/comments/krxl6t/for_those_of_you_comparing_these_protests_to/",
"https://www.reddit.com/r/PublicFreakout/comments/krx7yw/the_police_opened_the_gates_for_capitol_rioters/",
"https://www.reddit.com/r/news/comments/krzopk/megathread_part_2_trump_supporters_storm_us/",
"https://www.reddit.com/r/stupidpol/comments/kruuvf/trump_fedayeen_group_sperging_out_and_rioting_at/"
]
reddituser = 'dcalacci'
after = ''
data = list()
user_agent = 'Python.Find_links'+ ':v0.1 (by /u/' + reddituser + ')'
def pullurl(s):
    # regex for urls generally (raw string so \w etc. are not treated as escapes)
    rx = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
    return re.findall(rx, s)
def parseUrls(s):
    urls = pullurl(s)
    urlStrings = []
    for u in urls:
        # our regex splits each match into scheme, domain, and path
        urlStrings.append("{}://{}{}".format(u[0], u[1], u[2]))
    # ensure unique
    return list(set(urlStrings))
def find(key, dictionary):
    # recursively yield every value stored under `key` in a nested dict/list structure
    if not isinstance(dictionary, dict):
        return
    for k, v in dictionary.items():
        if k == key:
            yield v
        elif isinstance(v, dict):
            for result in find(key, v):
                yield result
        elif isinstance(v, list):
            for d in v:
                for result in find(key, d):
                    yield result
for thread in redditThreads:
    print("processing thread {}".format(thread))
    headers = {'user-agent': user_agent}
    res = requests.get(thread + 'comments.json', headers=headers)
    res = res.json()
    # each listing in the response holds a batch of comments; pull every 'body' field
    for listing in res:
        output = listing['data']['children']
        # do some nasty cleansing to find the data we need
        bodies = [list(find('body', o)) for o in output]
        data = data + bodies
    # wait between 10 and 15 seconds to prevent reddit from blocking the script
    time.sleep(10 + randint(0, 5))
data = list(itertools.chain(*data))
allUrls = list(itertools.chain(*[parseUrls(d) for d in data]))
print('found {} total URLs. Pulling media...'.format(len(allUrls)))
curtime = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H:%M:%S')
# make sure the output directory exists before writing
os.makedirs('data', exist_ok=True)
urlfile = "data/allurls_{}.txt".format(curtime)
# write parsed urls to file
with open(urlfile, 'w') as f:
    for url in allUrls:
        f.write(url + "\n")
print("parsed URLs from pages. Downloading media...")
# use you-get to download all the media we can
if not os.path.exists('data/media'):
os.mkdir("data/media")
with open(urlfile, 'r') as f:
urls = [l.strip() for l in f.readlines()]
for u in urls:
subprocess.run(['you-get', '-a', '-o', 'data/media', u])

pashri commented Jan 6, 2021

You might be able to do this more easily using PRAW. I've never used it, but I've heard of people using it. It's a package that interfaces with Reddit's API.
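
A minimal sketch of what that PRAW-based approach could look like (untested; assumes a registered script-type app at https://www.reddit.com/prefs/apps, with CLIENT_ID and CLIENT_SECRET as placeholder credentials, and reuses the redditThreads list and data accumulator from the gist):

import praw

reddit = praw.Reddit(
    client_id="CLIENT_ID",          # placeholder - substitute your own app credentials
    client_secret="CLIENT_SECRET",  # placeholder
    user_agent="Python.Find_links:v0.1 (by /u/dcalacci)",
)

for thread in redditThreads:
    submission = reddit.submission(url=thread)
    # expand every "load more comments" stub so no comment bodies are missed
    submission.comments.replace_more(limit=None)
    for comment in submission.comments.list():
        data.append([comment.body])  # same list-of-lists shape the gist flattens later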
