Skip to content

Instantly share code, notes, and snippets.

@brwnj
Last active August 29, 2015 14:05
Show Gist options
  • Save brwnj/650a00da6b8141e71e70 to your computer and use it in GitHub Desktop.
Save brwnj/650a00da6b8141e71e70 to your computer and use it in GitHub Desktop.
Download the best-resolution stream of each YouTube video found among the top <limit> hot posts of the 'videos' subreddit into the <out> directory.
#!/usr/bin/env python
# coding=utf-8
"""
Download the best resolution of the top <limit> video from subreddit 'videos'
to <out> directory.
"""
import multiprocessing
import os
import pafy
import praw
import sys
import time
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
# User-agent string handed to praw.Reddit() when the Reddit client is created.
USERAGENT = "yt_download"
def downloader(job, callback=None):
    """Download one queued video to its destination path.

    Intended to be driven by ``multiprocessing.Pool.map``, which passes each
    work item as a single argument -- hence the packed tuple.

    Args:
        job: ``(video, title, out)`` tuple. ``video`` must expose
            ``download(quiet=..., filepath=...)``; ``title`` is only used in
            the progress message; ``out`` is the destination file path.
        callback: Accepted for call compatibility; not used.
    """
    # Unpack here rather than in the signature: tuple parameters are
    # Python 2-only syntax (removed by PEP 3113), while this form behaves the
    # same on Python 2 and Python 3.
    video, title, out = job
    sys.stderr.write("Downloading %s\n" % title)
    video.download(quiet=True, filepath=out)
def _watch_url(url):
    """Return a plain watch URL for *url*, or None if none can be derived.

    URLs containing ``attribution_link`` carry the video id percent-encoded
    after ``watch%3Fv%3D``; the id runs up to the next ``%``. Any other URL is
    returned unchanged.
    """
    if 'attribution_link' not in url:
        return url
    parts = url.split("watch%3Fv%3D")
    if len(parts) < 2:
        # Marker missing: indexing [1] here used to raise IndexError.
        return None
    watch_id = parts[1].split("%")[0]
    if not watch_id:
        return None
    return "http://www.youtube.com/watch?v=%s" % watch_id


def _safe_title(title):
    """Build a filename stem: spaces become "_", other non-alphanumerics are dropped."""
    return ''.join(x for x in title.replace(" ", "_") if x.isalnum() or x == "_")


def main(out, limit, pool, debug):
    """Queue and download the hot YouTube links of the 'videos' subreddit.

    Args:
        out: Output directory; created if it does not exist yet.
        limit: Number of hot submissions to request from Reddit. Submissions
            whose URL does not contain 'youtu' are skipped.
        pool: Number of worker processes used for simultaneous downloads.
        debug: When true, print a per-submission trace to stdout.

    Raises:
        OSError: If ``out`` cannot be created and does not already exist.
    """
    def log(fmt, *args):
        # Single-argument print(...) gives the same output on Python 2 and 3.
        if debug:
            print(fmt % args)

    try:
        os.mkdir(out)
    except OSError:
        # An existing directory is fine; any other failure would make every
        # download fail later, so surface it now instead of swallowing it.
        if not os.path.isdir(out):
            raise

    # NOTE(review): get_subreddit()/get_hot() look like the pre-4.0 praw API --
    # confirm against the installed praw version.
    r = praw.Reddit(user_agent=USERAGENT)
    # list of tuples (pafy_obj, video_title, filename)
    videos = []
    # track duplicate links on reddit
    seen = set()
    sys.stderr.write("Retrieving URLs from Reddit\n")
    for sub in r.get_subreddit("videos").get_hot(limit=limit):
        log("processing %s", sub.title)
        if 'youtu' not in sub.url:
            log(" >> non-youtube")
            continue
        url = _watch_url(sub.url)
        if url is None:
            log(" >> failed to parse attribution link")
            continue
        try:
            best = pafy.new(url).getbest()
            if best is None:
                # presumably pafy returns None when no stream is available --
                # treat it like an unavailable video instead of crashing below.
                raise IOError("no downloadable stream")
            title = _safe_title(best.title)
            extension = best.extension
        except (IOError, ValueError):
            # IOError: video no longer available on youtube.
            # ValueError: presumably raised by pafy for URLs it cannot map to
            # a video id (e.g. channel links) -- TODO confirm.
            log(" >> failed to retrieve youtube link")
            continue
        log(" >> new title from youtube: %s", title)
        fn = os.path.join(out, title + "." + extension)
        if os.path.exists(fn):
            log(" >> video exists in %s", out)
        elif fn in seen:
            log(" >> duplicate link")
        else:
            log(" >> url added to queue")
            videos.append((best, title, fn))
            seen.add(fn)

    if not videos:
        return
    # Start the workers only once there is work for them, and always release
    # them afterwards (Pool is not a context manager on Python 2).
    workers = multiprocessing.Pool(pool)
    try:
        workers.map(downloader, videos)
    finally:
        workers.close()
        workers.join()
if __name__ == '__main__':
    # Command-line entry point: parse the options and hand them to main().
    # The module docstring doubles as the --help description.
    parser = ArgumentParser(
        description=__doc__,
        formatter_class=ArgumentDefaultsHelpFormatter,
    )
    # The default output directory is today's date, e.g. 20140829.
    parser.add_argument(
        '-o', '--out',
        default=time.strftime("%Y%m%d"),
        help='output directory',
    )
    parser.add_argument(
        '-l', '--limit',
        type=int,
        default=50,
        help='number of reddit links to download (not all will be youtube videos)',
    )
    parser.add_argument(
        '-p', '--pool',
        type=int,
        default=20,
        help='simultaneous downloads',
    )
    parser.add_argument('--debug', action='store_true')
    options = parser.parse_args()
    main(options.out, options.limit, options.pool, options.debug)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment