#!/usr/bin/python
# -*- coding: utf-8 -*-
import _addoncompat
import _common
import _connection
import _main_viacom
import re
import sys
import urllib
import simplejson
from bs4 import BeautifulSoup

pluginHandle = int(sys.argv[1])

SITE = 'comedy'
NAME = 'Comedy Central'
DESCRIPTION = "COMEDY CENTRAL, the #1 brand in comedy, is available to over 99 million viewers nationwide and is a top-rated network among men ages 18-24 and 18-34 and adults ages 18-49. With on-air, online and on-the-go mobile technology, COMEDY CENTRAL gives its audience access to the cutting-edge, laugh-out-loud world of comedy wherever they go. Hit series include Tosh.0, Workaholics, Futurama, Key & Peele, Ugly Americans and the Emmy and Peabody Award-winning series The Daily Show with Jon Stewart, The Colbert Report and South Park. COMEDY CENTRAL is also involved in producing nationwide stand-up tours, boasts its own record label and operates one of the most successful home entertainment divisions in the industry. COMEDY CENTRAL is owned by, and is a registered trademark of Comedy Partners, a wholly-owned unit of Viacom Inc. (NASDAQ: VIA and VIAB). For more information visit COMEDY CENTRAL's press Web site at www.cc.com/press or the network's consumer site at www.comedycentral.com and follow us on Twitter @ComedyCentralPR for the latest in breaking news updates, behind-the-scenes information and photos."
BASE = 'http://www.cc.com'
SHOWS = 'http://www.cc.com/shows'
VIDEOURL = 'http://media.mtvnservices.com/'
MP4URL = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639/44620/mtvnorigin'
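
# NOTE: the sitemode functions below (rootlist, seasons, episodes, play_video,
# list_qualities) are dispatched by the add-on's shared router based on the plugin
# URL; the exact routing lives outside this module. An illustrative invocation:
#   plugin://<addon-id>/?url="http%3A%2F%2Fwww.cc.com%2F..."&mode="comedy"&sitemode="episodes"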

def masterlist():
    """ Build a list of all shows. First get all the shows listed in the Full Episodes menu,
    then read all shows from the page's microdata. Filter to make sure there are no duplicates. """
    master_db = []
    root_doubles = []
    root_url = SHOWS
    root_data = _connection.getURL(root_url)
    root_tree = BeautifulSoup(root_data, 'html5lib')
    root_menu = root_tree.find('div', class_ = 'full_episodes').find_all('a', href = re.compile('^http'))
    for root_item in root_menu:
        root_name = root_item.string
        if root_name.lower() not in root_doubles and root_name.split(' with ')[0].lower() not in root_doubles:
            root_doubles.append(root_name.lower().split(' with ')[0])
            season_url = root_item['href']
            master_db.append((root_name, SITE, 'seasons', season_url))
    root_menu = root_tree.find_all('li', itemtype = 'http://schema.org/TVSeries')
    for root_item in root_menu:
        try:
            root_name = root_item.find('meta', itemprop = 'name')['content']
        except:
            root_name = root_item.find_all(itemprop = 'name')[0].string
        try:
            season_url = root_item.find('meta', itemprop = 'url')['content']
        except:
            season_url = root_item.find('a', itemprop = 'url')['href']
        if root_name.lower() not in root_doubles and root_name.split(' with ')[0].lower() not in root_doubles:
            root_doubles.append(root_name.lower().split(' with ')[0])
            master_db.append((root_name, SITE, 'seasons', season_url))
    return master_db
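
# Each masterlist() entry is a (name, site, sitemode, url) tuple, e.g. (illustrative):
#   ('Tosh.0', 'comedy', 'seasons', 'http://www.cc.com/shows/tosh-0')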

def rootlist():
    """ Add a container for every show. All logic is in masterlist(). """
    for show in masterlist():
        _common.add_show(show[0], show[1], show[2], show[3])
    _common.set_view('tvshows')

def _get_manifest(page_url):
    """ Try to get the manifest JavaScript object for the current page. The input URL can
    be any kind of page. Returns the manifest feed as a JSON object if found, else False. """
    triforceManifestFeed = None
    page_data = _connection.getURL(page_url)
    page_tree = BeautifulSoup(page_data, 'html5lib')
    scripts = page_tree.find_all('script')
    try:
        for script in scripts:
            # script.string is None for scripts with nested markup; skip those
            if script.string and 'triforceManifestFeed' in script.string:
                triforceManifestFeed = script.string.split(' = ')[1]
                triforceManifestFeed = triforceManifestFeed.strip()[:-1]  # remove the trailing ; from the string
                triforceManifestFeed = simplejson.loads(triforceManifestFeed)
        return triforceManifestFeed
    except:
        return False
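
# The page is assumed to embed the manifest roughly as follows (inferred from the
# ' = ' split and trailing-semicolon strip above):
#   <script>var triforceManifestFeed = {"manifest": {"zones": {...}}};</script>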

def _get_manifest_feed(feed_url):
    """ Load a single manifest feed as a JSON object. The input should already be a feed URL.
    #ManifestFeed can be appended to the URL to mark it as a manifest feed, as opposed to a
    full page URL. The #ManifestFeed marker is removed before calling the URL. """
    try:
        if feed_url.endswith('#ManifestFeed'):
            feed_url = feed_url[:-13]  # strip #ManifestFeed from the URL
        page_data = _connection.getURL(feed_url)
        return simplejson.loads(page_data)
    except:
        return False
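
# Example (illustrative): episodes() receives a URL such as
# 'http://www.cc.com/feeds/.../module.json#ManifestFeed' and routes it here
# instead of scraping it as an HTML page.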

def seasons(show_url = _common.args.url):
    """ Load the items for a show. This can be "Full Episodes" and "Clips", or something
    based on the data.
    South Park has a different site structure, so it is redirected to a different function.
    Some pages have a manifest JavaScript object that contains JSON feeds for all episodes;
    other pages do not. This function tries to find out whether the show's home page has such
    a feed. If so, only data from the feed is used. If the home page does not have the feed,
    it tries to find the URLs for the full episodes and clips pages. For each of these pages
    the script tries to load the manifest feed. If that cannot be found, items are added based
    on the HTML page. A consequence of this is that some shows can have mixed results: the full
    episodes page does not have a manifest, but the clips page does. This can lead to duplicated
    container items. Many shows seem to contain a feed for full episodes, but this feed is empty. """
    if 'South Park' in _common.args.name:
        add_items_from_southpark(show_url)
    else:
        triforceManifestFeed = _get_manifest(show_url)
        if triforceManifestFeed:
            add_items_from_manifestfile(triforceManifestFeed, show_url)
        else:
            full_episodes_url = get_full_episodes_url(show_url)
            clips_url = get_clips_url(show_url)
            if full_episodes_url:
                triforceManifestFeed = _get_manifest(full_episodes_url)
                if triforceManifestFeed:
                    add_items_from_manifestfile(triforceManifestFeed, full_episodes_url)
                else:
                    _common.add_directory('Full Episodes', SITE, 'episodes', full_episodes_url)
            if clips_url:
                triforceManifestFeed = _get_manifest(clips_url)
                if triforceManifestFeed:
                    add_items_from_manifestfile(triforceManifestFeed, clips_url)
                else:
                    _common.add_directory('Clips', SITE, 'episodes', clips_url)
    _common.set_view('seasons')

def episodes(episode_url = _common.args.url):
    """ Add individual episodes. If the URL is a manifest feed, load from JSON; otherwise
    analyse the HTML of the page. """
    if episode_url.endswith('#ManifestFeed'):
        triforceManifestFeed = _get_manifest_feed(episode_url)
        if triforceManifestFeed:
            add_video_from_manifestfile(triforceManifestFeed)
    else:
        episodes_from_html(episode_url)
    _common.set_view('episodes')

def get_full_episodes_url(show_url):
    """ Get the URL to the full episodes page. """
    show_data = _connection.getURL(show_url)
    show_tree = BeautifulSoup(show_data, 'html5lib')
    show_menu = show_tree.find('a', class_ = 'episodes')
    if show_menu is None:
        show_menu = show_tree.find('a', text = re.compile('full episodes', re.IGNORECASE))
    if show_menu is not None:
        full_episodes_url = show_menu['href']
        if 'http' not in full_episodes_url:
            full_episodes_url = show_url + full_episodes_url
        return full_episodes_url
    else:
        return False

def get_clips_url(show_url):
    """ Get the URL to the clips page. """
    show_data = _connection.getURL(show_url)
    show_tree = BeautifulSoup(show_data, 'html5lib')
    show_menu = show_tree.find('a', href = re.compile('(?<!stand-up)/(video|clips)'))
    if show_menu is not None:
        clips_url = show_menu['href']
        if 'http' not in clips_url:
            clips_url = show_url + clips_url
        return clips_url
    else:
        return False

def add_items_from_southpark(show_url):
    """ Add the seasons for South Park. """
    show_data = _connection.getURL(show_url)
    seasons = BeautifulSoup(show_data, 'html5lib').find_all('a', class_ = 'seasonbtn')
    if seasons:
        for season in seasons:
            try:
                display = 'Season %s' % str(int(season.string))
            except:
                display = 'Special %s' % season.string
            _common.add_directory(display, SITE, 'episodes', season['href'])
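
# The South Park season buttons are assumed to look roughly like (inferred from
# the parsing above): <a class="seasonbtn" href="...">18</a> becomes 'Season 18',
# while a non-numeric label becomes 'Special <label>'.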

def episodes_from_html(episode_url = _common.args.url, page = 1):
    """ Add episodes by analysing the HTML of the page. """
    episode_data = _connection.getURL(episode_url)
    episode_tree = None
    if page == 1:
        # on the first page, try to redirect to the real showcase feed or data-feed page
        try:
            episode_url = re.compile("var .*Showcase.* = '(.*)'").findall(episode_data)[0]
            if 'http' not in episode_url:
                episode_url = BASE + episode_url
            episode_data = _connection.getURL(episode_url)
        except:
            try:
                episode_tree = BeautifulSoup(episode_data, 'html5lib')
                episode_url = episode_tree.find('div', class_ = 'content')['data-feed']
                episode_data = _connection.getURL(episode_url)
                episode_tree = BeautifulSoup(episode_data, 'html5lib')
            except:
                pass
    if episode_tree is None:
        episode_tree = BeautifulSoup(episode_data, 'html5lib')
    if 'Clips' in _common.args.name:
        if 'southpark' in episode_url:
            add_clips_southpark(episode_tree)
        else:
            next_link = episode_tree.find('a', class_ = re.compile('next'))
            add_video(episode_tree)
            if next_link is not None:
                try:
                    if 'href' in next_link.attrs:
                        nexturl = next_link['href'].replace(' ', '+')
                    else:
                        nexturl = next_link['onclick'].split(';')[0].replace("loadContent('", "").replace("')", "")
                    if 'http' not in nexturl:
                        nexturl = BASE + nexturl
                    if page < int(_addoncompat.get_setting('maxpages')):
                        episodes_from_html(nexturl, page + 1)
                except:
                    pass
    else:
        if 'southpark' in episode_url:
            add_fullepisodes_southpark(episode_tree)
        else:
            next_link = episode_tree.find('a', class_ = re.compile('next'))
            add_video(episode_tree, False)
            if next_link is not None:
                try:
                    nexturl = next_link['href']
                    if nexturl[0] == '?':
                        nexturl = episode_url.split('?')[0] + nexturl
                    elif 'http' not in nexturl:
                        nexturl = BASE + nexturl
                    if page < int(_addoncompat.get_setting('maxpages')):
                        episodes_from_html(nexturl, page + 1)
                except:
                    pass
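
# Pagination links are assumed to appear either as a plain href or as an onclick
# handler, both handled above, e.g. (illustrative):
#   <a class="next" href="?page=2">   (relative ?page=2 is re-rooted on the feed URL)
#   <a class="next" onclick="loadContent('/shows/...?page=2');return false;">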

def _keyinfeed(keys1, keys2):
    """ Helper function to find out whether any key from one list is present in another list. """
    for key in keys1:
        if key in keys2:
            return True
    return False
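
# Example (illustrative):
#   _keyinfeed(['videos', 'episodes'], feed['result'].keys())  # True if the feed has either key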

def add_items_from_manifestfile(triforceManifestFeed, season_url):
    """ Add container items based on the manifest feed. If there are no items in a feed,
    skip it. Special rule not to add Daily Show items to The Colbert Report and vice versa. """
    feeds = []
    for zone in triforceManifestFeed['manifest']['zones']:
        thiszone = triforceManifestFeed['manifest']['zones'][zone]
        feed_data = _connection.getURL(thiszone['feed'])
        feed = simplejson.loads(feed_data)
        if _keyinfeed(['videos', 'episodes', 'playlist', 'playlists'], feed['result'].keys()):
            # skip feeds that are present but empty
            if 'episodes' in feed['result']:
                if len(feed['result']['episodes']) == 0:
                    continue
            elif 'videos' in feed['result']:
                if len(feed['result']['videos']) == 0:
                    continue
            elif 'playlist' in feed['result']:
                if len(feed['result']['playlist']) == 0:
                    continue
            elif 'playlists' in feed['result']:
                if len(feed['result']['playlists'][0]) == 0:
                    continue
            title = ''
            try:
                title = feed['result']['promo']['headline']
            except:
                pass
            if title == '':
                if ' - ' in thiszone['moduleName']:
                    title = thiszone['moduleName'].split(' - ')[1]
                else:
                    title = thiszone['moduleName']
            if title.endswith(' Promo'):
                title = title[:-6]
            feeds.append({'title': title, 'url': thiszone['feed']})
    feeds.sort(key = lambda x: x['title'])
    for feed in feeds:
        if 'Daily Show' in feed['title'] and 'colbertreport' in season_url:
            continue
        if 'Colbert' in feed['title'] and 'dailyshow' in season_url:
            continue
        # add #ManifestFeed at the end of the URL, so we can detect that this is a feed, not a full page
        _common.add_directory(feed['title'], SITE, 'episodes', feed['url'] + '#ManifestFeed')
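
# The manifest and its zone feeds are assumed to look roughly like (inferred from
# the lookups above):
#   {"manifest": {"zones": {"<zone>": {"moduleName": "...", "feed": "http://..."}}}}
# and each zone feed like:
#   {"result": {"promo": {"headline": "..."}, "episodes": [...], "videos": [...]}}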

def add_video_from_manifestfile(manifest_feed):
    """ Add videos based on a manifest feed. """
    try:
        shows = []
        items = manifest_feed['result']
        if 'episodes' in items:
            items = items['episodes']
        elif 'videos' in items:
            items = items['videos']
        elif 'playlist' in items:
            items = items['playlist']['videos']
        elif 'playlists' in items:
            # flatten all playlists into a single list of videos
            t_items = []
            for playlist in items['playlists']:
                for video in playlist['videos']:
                    t_items.append(video)
            items = t_items
        for item in items:
            try:
                episode_name = item['title']
            except:
                episode_name = item['shortTitle']
            epoch = float(item['airDate'])
            epoch = _common.convert_to_timezone(epoch, '', -5, epoch)
            episode_airdate = _common.format_date(epoch, '', '%d.%m.%Y', epoch)
            episode_plot = item['shortDescription']
            episode_thumb = item['images'][0]['url']
            url = item['url']
            if not url:
                url = item['canonicalURL']
            try:
                season_number = item['season']['seasonNumber']
                episode_number = str(int(str(item['season']['episodeNumber'])[len(str(season_number)):]))
            except:
                season_number = -1
                episode_number = -1
            u = sys.argv[0]
            u += '?url="' + urllib.quote_plus(url) + '"'
            u += '&mode="' + SITE + '"'
            u += '&sitemode="play_video"'
            infoLabels = { 'title' : episode_name,
                           'season' : season_number,
                           'episode' : episode_number,
                           'plot' : episode_plot,
                           'premiered' : episode_airdate }
            show = {'u': u, 'episode_name': episode_name, 'episode_thumb': episode_thumb, 'infoLabels': infoLabels, 'epoch': epoch}
            shows.append(show)
        if len(shows):
            shows = sorted(shows, key = lambda show: show['epoch'], reverse = True)
            for show in shows:
                _common.add_video(show['u'], show['episode_name'], show['episode_thumb'], infoLabels = show['infoLabels'], quality_mode = 'list_qualities')
    except:
        pass
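
# Each feed item is assumed to carry fields like (inferred from the parsing above):
#   {"title": "...", "airDate": "1440864000", "shortDescription": "...",
#    "images": [{"url": "http://..."}], "url": "http://...",
#    "season": {"seasonNumber": 3, "episodeNumber": 312}}
# where episodeNumber is prefixed with the season number, which is stripped above.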

def add_fullepisodes_southpark(episode_tree):
    """ Add full episodes for South Park by parsing the episode carousel. """
    try:
        episode_menu = episode_tree.find('div', class_ = 'content_carouselwrap').ol.find_all('li', recursive = False)
        for episode_item in episode_menu:
            if not episode_item.find('a', class_ = 'unavailable'):
                episode_name = episode_item.h5.string
                episode_airdate = episode_item.h6.string.replace('Original Air Date: ', '')
                episode_airdate = _common.format_date(episode_airdate, '%m.%d.%Y', '%d.%m.%Y')
                episode_plot = episode_item.p.string
                episode_thumb = episode_item.img['src'].split('?')[0]
                url = episode_item.a['href']
                try:
                    season_number, episode_number = re.compile('s([0-9]{2})e([0-9]{2})').findall(url)[0]
                except:
                    episode_number = -1
                    season_number = -1
                u = sys.argv[0]
                u += '?url="' + urllib.quote_plus(url) + '"'
                u += '&mode="' + SITE + '"'
                u += '&sitemode="play_video"'
                infoLabels = { 'title' : episode_name,
                               'season' : season_number,
                               'episode' : episode_number,
                               'plot' : episode_plot,
                               'premiered' : episode_airdate }
                _common.add_video(u, episode_name, episode_thumb, infoLabels = infoLabels, quality_mode = 'list_qualities')
    except:
        pass

def add_video(episode_tree, episode = False):
    """ Add videos from a page's schema.org microdata. If episode is True, only items
    marked with the 'episode' class are added. """
    try:
        episode_menu = episode_tree.find_all(itemtype = 'http://schema.org/TVEpisode')
        if not episode_menu:
            episode_menu = episode_tree.find_all(itemtype = 'http://schema.org/VideoObject')
        for episode_item in episode_menu:
            if episode == False or episode_item.find(class_ = 'episode'):
                episode_name = episode_item.find('meta', itemprop = 'name')['content']
                episode_plot = episode_item.find('meta', itemprop = 'description')['content']
                url = episode_item.find('meta', itemprop = 'url')['content']
                try:
                    episode_thumb = episode_item.find('meta', itemprop = 'image')['content'].split('?')[0]
                except:
                    try:
                        episode_thumb = episode_item.find('meta', itemprop = 'thumbnailUrl')['content'].split('?')[0]
                    except:
                        episode_thumb = episode_item.find('img')['src'].split('?')[0]
                try:
                    episode_airdate = episode_item.find('meta', itemprop = 'uploadDate')['content']
                except:
                    try:
                        episode_airdate = episode_item.find('meta', itemprop = 'datePublished')['content']
                        try:
                            episode_airdate = _common.format_date(episode_airdate, '%B %d, %Y')
                        except:
                            episode_airdate = _common.format_date(episode_airdate, '%b %d, %Y')
                    except:
                        episode_airdate = -1
                try:
                    episode_duration = episode_item.find('meta', itemprop = 'duration')['content']
                    try:
                        duration_mins, duration_seconds = re.compile('([0-9]*)M([0-9]*)S').findall(episode_duration)[0]
                        episode_duration_seconds = int(duration_mins) * 60 + int(duration_seconds)
                    except:
                        episode_duration_seconds = int(episode_duration.replace('S', '').replace('T', ''))
                except:
                    episode_duration_seconds = -1
                try:
                    episode_meta = episode_item.find('div', class_ = 'video_meta').text.split('|')[0]
                    season_number = int(episode_meta.split('-')[0].replace('Season', '').strip())
                    episode_number = int(episode_meta.split('-')[1].replace('Episode', '').strip()[1:])
                except:
                    season_number = -1
                    episode_number = -1
                u = sys.argv[0]
                u += '?url="' + urllib.quote_plus(url) + '"'
                u += '&mode="' + SITE + '"'
                u += '&sitemode="play_video"'
                infoLabels = { 'title' : episode_name,
                               'durationinseconds' : episode_duration_seconds,
                               'season' : season_number,
                               'episode' : episode_number,
                               'plot' : episode_plot,
                               'premiered' : episode_airdate }
                _common.add_video(u, episode_name, episode_thumb, infoLabels = infoLabels, quality_mode = 'list_qualities')
    except:
        pass
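
# The microdata parsed above is assumed to look roughly like:
#   <li itemtype="http://schema.org/TVEpisode">
#     <meta itemprop="name" content="...">
#     <meta itemprop="description" content="...">
#     <meta itemprop="url" content="http://...">
#     <meta itemprop="duration" content="PT21M30S">
#   </li>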

def add_clips_southpark(episode_tree):
    """ Add clips for South Park, skipping duplicate titles. """
    try:
        episode_menu = episode_tree.find_all('li', class_ = 'clips_thumb')
        clip_titles = []
        for episode_item in episode_menu:
            episode_name = episode_item.find('a', class_ = 'clips_thumb_link', text = True).string
            if episode_name not in clip_titles:
                clip_titles.append(episode_name)
                episode_plot = episode_item.find('h6').string.replace('"', '')
                url = episode_item.find('a')['href']
                episode_thumb = episode_item.find('img')['src'].split('?')[0]
                try:
                    episode_duration_seconds = _common.format_seconds(episode_item.find('span', class_ = 'clips_duration').string)
                except:
                    episode_duration_seconds = -1
                try:
                    episode_season = int(episode_item.find('h5', class_ = 'clips_thumb_season').string.replace('Season ', ''))
                except:
                    episode_season = -1
                u = sys.argv[0]
                u += '?url="' + urllib.quote_plus(url) + '"'
                u += '&mode="' + SITE + '"'
                u += '&sitemode="play_video"'
                infoLabels = { 'title' : episode_name,
                               'duration' : episode_duration_seconds,
                               'season' : episode_season,
                               'plot' : episode_plot,
                               'tvshowtitle' : 'South Park' }
                _common.add_video(u, episode_name, episode_thumb, infoLabels = infoLabels, quality_mode = 'list_qualities')
    except:
        pass

def play_video(video_url = _common.args.url):
    """ Resolve the video page to an mgid (or legacy SWF URL) and hand playback over
    to the shared Viacom player. """
    video_data = _connection.getURL(video_url)
    try:
        mgid = BeautifulSoup(video_data, 'html5lib').find('div', attrs = {'data-mgid' : True})['data-mgid']
        video_url2 = mgid
    except:
        video_url2 = re.compile('swfobject\.embedSWF\("(.*?)"').findall(video_data)[0]
    _main_viacom.play_video(BASE, video_url2)
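
# The player div is assumed to embed the mgid roughly as (inferred from the
# data-mgid lookup above):
#   <div data-mgid="mgid:arc:video:comedycentral.com:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx">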

def list_qualities(video_url = _common.args.url):
    """ List the available qualities for a video, using the same mgid/SWF resolution
    as play_video(). """
    video_data = _connection.getURL(video_url)
    try:
        mgid = BeautifulSoup(video_data, 'html5lib').find('div', attrs = {'data-mgid' : True})['data-mgid']
        video_url2 = VIDEOURL + mgid
    except:
        video_url2 = re.compile('swfobject\.embedSWF\("(.*?)"').findall(video_data)[0]
    return _main_viacom.list_qualities(BASE, video_url2)