Download the best resolution of the top <limit> videos from subreddit 'videos' to the <out> directory.
#!/usr/bin/env python
# coding=utf-8
"""
Download the best resolution of the top <limit> videos from subreddit 'videos'
to the <out> directory.
"""
import multiprocessing
import os
import sys
import time
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

import pafy
import praw

USERAGENT = "yt_download"
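
# pool worker; receives one (pafy_stream, sanitized_title, filepath) tuple
# per queued video (uses Python 2 tuple-parameter unpacking in the signature)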
def downloader((video, title, out)):
    sys.stderr.write("Downloading %s\n" % title)
    video.download(quiet=True, filepath=out)

def main(out, limit, pool, debug):
    try:
        os.mkdir(out)
    except OSError:
        pass
    r = praw.Reddit(user_agent=USERAGENT)
    p = multiprocessing.Pool(pool)
    # list of tuples (pafy_obj, video_title, filename)
    videos = []
    # track duplicate links on reddit
    seen = set()
    sys.stderr.write("Retrieving URLs from Reddit\n")
    for sub in r.get_subreddit("videos").get_hot(limit=limit):
        if debug:
            print "processing %s" % sub.title
        if 'youtu' not in sub.url:
            if debug:
                print " >> non-youtube"
            continue
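        # attribution links percent-encode the real watch URL in their query
        # string; recover the video id and rebuild a direct watch URL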
        if 'attribution_link' in sub.url:
            watch_id = sub.url.split("watch%3Fv%3D")[1].split("%")[0]
            sub.url = "http://www.youtube.com/watch?v=%s" % watch_id
        try:
            best = pafy.new(sub.url).getbest()
            # sanitize the title for use as a filename
            title = ''.join(x for x in best.title.replace(" ", "_") if x.isalnum() or x == "_")
            if debug:
                print " >> new title from youtube: %s" % title
            fn = os.path.join(out, title + "." + best.extension)
            if not os.path.exists(fn):
                if fn not in seen:
                    if debug:
                        print " >> url added to queue"
                    videos.append((best, title, fn))
                    seen.add(fn)
                else:
                    if debug:
                        print " >> duplicate link"
            else:
                if debug:
                    print " >> video exists in %s" % out
        except IOError:
            # video no longer available on youtube
            if debug:
                print " >> failed to retrieve youtube link"
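    # hand the queue to the worker pool; map blocks until all downloads finish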
    p.map(downloader, videos)

if __name__ == '__main__':
    p = ArgumentParser(description=__doc__, formatter_class=ArgumentDefaultsHelpFormatter)
    p.add_argument('-o', '--out', default=time.strftime("%Y%m%d"), help='output directory')
    p.add_argument('-l', '--limit', type=int, default=50,
                   help='number of reddit links to scan (not all will be youtube videos)')
    p.add_argument('-p', '--pool', type=int, default=20, help='simultaneous downloads')
    p.add_argument('--debug', action='store_true')
    args = p.parse_args()
    main(args.out, args.limit, args.pool, args.debug)
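
A minimal usage sketch, assuming the script is saved as yt_download.py (a
hypothetical filename) and the third-party pafy and praw packages are
installed (e.g. pip install pafy praw); run it under Python 2, which the
print statements and tuple-parameter unpacking require:

    python yt_download.py --out videos --limit 25 --pool 8

This scans the 25 hottest /r/videos submissions, skips non-YouTube links, and
downloads up to 8 videos concurrently into the videos/ directory.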