Skip to content

Instantly share code, notes, and snippets.

@handyman5
Created February 17, 2012 01:07
Show Gist options
  • Save handyman5/1849444 to your computer and use it in GitHub Desktop.
Save handyman5/1849444 to your computer and use it in GitHub Desktop.
Daily Show downloader
#!/usr/bin/python
#
# Notes
# -----
# This script depends on rtmpdump and mythnettv; specify their paths in the global variables below.
# If you don't need mythnettv support, set it to an empty string (MYTHNETTV="") and the script will skip that step
# svn update -r275 # rtmpdump
#
import re,threading
from os import stat, popen
from sys import exit
from urllib import urlopen
from optparse import OptionParser
from lxml import etree
from re import escape
import logging
# URL template for the media feed; formatted with a numeric id (see get_media_files).
GEN_URL="http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?uri=mgid:cms:episode:thedailyshow.com:%d"
# Command used to import the merged video into MythTV; an empty string skips the import step.
MYTHNETTV="mythnettv"
# Command used to download the RTMP streams.
RTMPDUMP="rtmpdump"
# Extra arguments added to every rtmpdump invocation.
# NOTE(review): -W presumably names the player SWF used for verification and -q
# presumably silences rtmpdump's output -- confirm against the rtmpdump manual.
RTMPDUMP_OPTS="-W http://media.mtvnservices.com/player/release/?v=4.4.6 -q"
def get_media_id(page):
    """Extract the numeric media id from the text of an episode page.

    Looks for the first identifier of the form
    ``mgid:cms:<type>:comedycentral.com:<digits>`` in *page* and returns the
    trailing digits as an int.

    Raises:
        IndexError: if *page* contains no such identifier. This is the same
            exception type an empty match list would have produced, but it
            now carries a message saying what was being looked for.
    """
    # The dots are escaped so that only the literal host name matches; only
    # the first occurrence is needed, so search() replaces findall().
    match = re.search(r'mgid:cms:[^:]+:comedycentral\.com:([0-9]+)', page)
    if match is None:
        raise IndexError(
            'get_media_id:: no mgid:cms:<type>:comedycentral.com:<id> found in page')
    logging.debug('get_media_id:: matches[0] = %s', match.group(0))
    media_id = int(match.group(1))
    logging.info('get_media_id:: media_id = %d', media_id)
    return media_id
def get_metadata(page):
    """Scrape the show title, episode subtitle and description from a page.

    Args:
        page: HTML text of an episode page.

    Returns:
        A ``(title, subtitle, description)`` tuple. *title* is the stripped
        text of the first ``<div class="showName">``, *subtitle* the stripped
        text of the first ``<h1 class="subTitle">``, and *description* the
        ``content`` attribute of the first ``<meta name="description">``
        (an empty string if that attribute is absent).

    Raises:
        IndexError: if any of the three elements is missing from the page.
    """
    html = etree.HTML(page)
    # The XPath expressions start with "//", so they are evaluated against
    # the whole document; separate <head>/<body> lookups are not needed.
    title = html.xpath("//div[@class='showName']")[0].text.strip()
    subtitle = html.xpath("//h1[@class='subTitle']")[0].text.strip()
    # Read the attribute by name: picking it by position breaks as soon as
    # the page emits the <meta> attributes in a different order.
    description = html.xpath("//meta[@name='description']")[0].get('content', '')
    logging.debug('get_metadata:: %s: %s: %s' % (title, subtitle, description))
    return (title, subtitle, description)
def get_episode_list():
    """Collect the episodes linked from each show's full-episodes page.

    Every listing page is fetched and each anchor found under a
    ``<span class="date">`` contributes one entry.

    Returns:
        A list of ``(show title, anchor text, anchor href)`` tuples.
    """
    listing_pages = { 'The Daily Show': "http://www.thedailyshow.com/full-episodes", 'Colbert Report': "http://www.colbertnation.com/full-episodes/" }
    found = []
    for show in listing_pages:
        markup = urlopen(listing_pages[show]).read()
        tree = etree.HTML(markup)
        for link in tree.xpath("//span[@class='date']/a"):
            label = link.text
            target = link.attrib['href']
            logging.debug('get_episode_list:: item.text = %s, item.href = %s' % (label, target))
            found.append((show, label, target))
    return found
def get_media_files(id_num):
    """Return the .mp4 stream URLs listed in the media feed for *id_num*.

    Args:
        id_num: numeric id substituted into the GEN_URL template.

    Returns:
        A list of the URLs found inside ``<src>...</src>`` elements that end
        in ``.mp4``, in feed order. The list is empty if none are found.
    """
    response = urlopen(GEN_URL % id_num)
    try:
        output = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    # [^<\n]* keeps each match inside a single <src> element; a greedy ".*"
    # would merge several entries that share one line into one bogus URL.
    urls = re.findall(r'<src>([^<\n]*\.mp4)</src>', output)
    logging.debug('get_media_files:: urls = %s' % '\n'.join(urls))
    return urls
def merge_files(id_num, filenames):
    """Merge downloaded segments into ``output_<id_num>.avi`` with mencoder.

    If the output file already exists the merge is skipped and the existing
    file is reused.

    Args:
        id_num: numeric id used to build the output file name.
        filenames: input file names, in playback order. Empty strings are
            ignored.

    Returns:
        The output file name, ``"output_%d.avi" % id_num``. It is returned
        whether or not the merge ran or succeeded; failures are logged.
    """
    import subprocess  # local import: the file does not import it at top level

    output_name = "output_%d.avi" % id_num
    try:
        stat(output_name)
    except OSError:
        pass  # no previous result, fall through and run the merge
    else:
        return output_name

    inputs = [name for name in filenames if name]
    # An argument list (no shell) keeps file names containing spaces or shell
    # metacharacters from being interpreted, and call() blocks until mencoder
    # has exited, so the file is complete before the caller uses it.
    cmd = ["mencoder", "-really-quiet",
           "-oac", "mp3lame",
           "-ovc", "lavc",
           "-lavcopts", "vcodec=mpeg4:vbitrate=1800",
           "-o", output_name] + inputs
    logging.debug('merge_files:: cmd = %s' % ' '.join(cmd))
    try:
        status = subprocess.call(cmd)
    except OSError as err:
        # Typically the executable is missing; report it instead of crashing.
        logging.error('merge_files:: could not run mencoder: %s' % err)
    else:
        if status != 0:
            logging.warning('merge_files:: mencoder exited with status %d' % status)
    return output_name
########################################
class Downloader(threading.Thread):
    """Thread that downloads one RTMP stream to a local file with rtmpdump.

    Attributes:
        rtmp: the stream URL passed to the constructor.
        output: local file name of the download once run() has finished, or
            an empty string if the stream was skipped or run() has not
            completed yet.
    """

    # Streams whose URL contains one of these substrings are not downloaded.
    # NOTE(review): presumably short promo/bumper clips -- confirm.
    _SKIP_MARKERS = ('sixty', 'sting')

    def __init__(self, url):
        self.rtmp = url
        self.output = ""
        threading.Thread.__init__(self)

    def download_rtmp_url(self):
        """Download self.rtmp unless the target file already exists.

        The local file name is the last path segment of the URL with a
        trailing ``.mp4`` extension changed to ``.flv``.

        Returns:
            The local file name, whether or not a download was needed or
            succeeded; failures are logged.
        """
        import subprocess  # local import: the file does not import it at top level

        filename = self.rtmp.split("/")[-1]
        # Swap only the extension; a blanket replace would also rewrite any
        # "mp4" appearing earlier in the name.
        if filename.endswith(".mp4"):
            filename = filename[:-len("mp4")] + "flv"
        try:
            stat(filename)
        except OSError:
            pass  # not downloaded yet
        else:
            return filename

        # An argument list (no shell) keeps characters in the remote URL from
        # being interpreted by a shell, and call() blocks until rtmpdump has
        # exited, so joining this thread means the download has finished.
        cmd = [RTMPDUMP] + RTMPDUMP_OPTS.split() + ["-o", filename, "-r", self.rtmp]
        logging.debug('download_rtmp_url:: cmd = %s' % ' '.join(cmd))
        try:
            status = subprocess.call(cmd)
        except OSError as err:
            logging.error('download_rtmp_url:: could not run %s: %s' % (RTMPDUMP, err))
        else:
            if status != 0:
                logging.warning('download_rtmp_url:: %s exited with status %d' % (RTMPDUMP, status))
        return filename

    def run(self):
        """Thread body: skip excluded streams, otherwise download and record the file name."""
        if any(marker in self.rtmp for marker in self._SKIP_MARKERS):
            self.output = ""
            return
        logging.debug('download_rtmp_url:: output = %s' % self.rtmp)
        self.output = self.download_rtmp_url()
########################################
if __name__ == '__main__':
    # Script entry point: pick an episode (from the command line or from an
    # interactive list), download its segments in parallel, merge them and
    # optionally import the result into MythTV.
    import subprocess  # local import: only needed when run as a script

    usage = '''
%prog http://www.thedailyshow.com/full-episodes/date-and-guest
%prog http://www.colbertnation.com/full-episodes/date-and-title-of-the-episode
%prog http://media.mtvnservices.com/mgid:cms:item:comedycentral.com:{ID}
%prog http://media.mtvnservices.com/mgid:cms:video:comedycentral.com:{ID}
%prog http://media.mtvnservices.com/mgid:cms:fullepisode:comedycentral.com:{ID}
'''
    parser = OptionParser(usage)
    parser.add_option("-l", "--list", action="store_true", dest="list", default=False, help="show a list of this week's episodes to select from")
    # -d is a plain on/off flag, so the help text says so.
    parser.add_option("-d", "--debug", action="store_true", dest="debug", default=False, help="enable debug-level logging (the default level is info)")
    (options, args) = parser.parse_args()
    if not args and not options.list:
        parser.print_help()
        exit(1)

    logging.basicConfig(level=logging.DEBUG if options.debug else logging.INFO)

    if options.list:
        episodes = get_episode_list()
        for index, episode in enumerate(episodes):
            print("%d: %s - %s" % (index, episode[0], episode[1]))
        selected = raw_input('Enter an episode number: ')
        try:
            url = episodes[int(selected)][2]
        except (ValueError, IndexError):
            # parser.error() prints the message and exits with status 2.
            parser.error('invalid episode number: %r' % selected)
    else:
        url = args[0]

    page_data = urlopen(url).read()
    id_num = get_media_id(page_data)
    (title, subtitle, description) = get_metadata(page_data)

    # The media feeds for ids id_num+1 .. id_num+4 are queried and the last
    # file listed in each is downloaded, one thread per file.
    # NOTE(review): presumably these are the episode's four segments and the
    # last entry is the preferred rendition -- confirm.
    # (An earlier variant downloaded every file listed for id_num itself.)
    threads = []
    for offset in range(1, 5):
        media_files = get_media_files(id_num + offset)
        if not media_files:
            # Nothing listed for this id; skip it instead of crashing on [-1].
            logging.warning('no media files found for id %d; skipping' % (id_num + offset))
            continue
        d = Downloader(media_files[-1])
        threads.append(d)
        d.start()

    results_new = []
    for thread in threads:
        thread.join()
        logging.debug('downloader output: %s' % thread.output)
        results_new.append(thread.output)
    logging.debug('\n'.join(results_new))

    logging.debug('Merging files for id %d' % id_num)
    result = merge_files(id_num, results_new)

    if MYTHNETTV:
        # Pass the scraped text as separate arguments instead of pasting it
        # into a shell string: re.escape() is not shell quoting and left stray
        # backslashes in the description, while title and subtitle were not
        # protected at all.
        cmd = [MYTHNETTV, 'importlocal', result, title, subtitle, description]
        logging.debug('Importing into MythTV; command = %s' % cmd)
        try:
            status = subprocess.call(cmd)
        except OSError as err:
            logging.error('could not run %s: %s' % (MYTHNETTV, err))
        else:
            if status != 0:
                logging.warning('%s exited with status %d' % (MYTHNETTV, status))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment