Last active
January 12, 2018 06:37
-
-
Save dreness/a55609dd9a5df3e5712b3921741d5550 to your computer and use it in GitHub Desktop.
Reveal OverwatchLeague video URLs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # -*- coding: UTF-8 -*- | |
| """ | |
| Work in progress. For now, just show URLs to m3u8 files of match videos. | |
| To use, first do: | |
| pip install -r requirements.txt | |
| TODO: pre-season VODs were split out by game; season1 VODs contain all games in a match | |
| """ | |
| from __future__ import unicode_literals | |
| from os import getcwd | |
| import re | |
| import sys | |
| import json | |
| import attr | |
| from textwrap import dedent | |
| import argparse | |
| from pprint import pprint as pp | |
| from datetime import datetime, timedelta | |
| import youtube_dl | |
| import requests | |
| import validators | |
| import cachecontrol | |
| from bs4 import BeautifulSoup | |
@attr.s
class OWGame(object):
    """
    A single game of Overwatch. For us, this represents everything that happens
    in a single VOD file on overwatchleague.com.

    Args:
        matchparser (OWMatchparser): reference to the match parser
        parentmatch (OWMatch): reference to the match containing this game
        gamedesc (str): display description
        gameslug (str): short string that is unique relative to siblings
        date (str): posting date, stored as scraped from the catalog page
        m3u8 (str): url to this game's m3u8 file that yields the video.
        pageurl (str): url to the web page that holds the m3u8
        duration (timedelta): duration of game
    """
    # pylint: disable=too-many-instance-attributes
    # pylint: disable=too-many-arguments
    matchparser = attr.ib()
    parentmatch = attr.ib()
    gamedesc = attr.ib()
    gameslug = attr.ib(default=None)
    date = attr.ib(default=None)
    m3u8 = attr.ib(default=None)
    pageurl = attr.ib(default=None)
    duration = attr.ib(default=None)

    def __str__(self):
        return str(self.gamedesc)

    @staticmethod
    def _extract_stream_url(page_text):
        """
        Scan the <script> tags of a detail page for the stream description.

        Each script line is split on " = " and every piece that mentions
        "m3u8" is tried as JSON; the URL is read from
        ``['streams'][0]['streamUrl']``.

        Returns:
            str or None: the stream URL, or None when no piece could be
            decoded. When several pieces qualify the last one wins.
        """
        found = None
        soup = BeautifulSoup(page_text, 'html.parser')
        for script in soup.find_all("script"):
            for line in script.text.split("\n"):
                for piece in line.split(" = "):
                    candidate = piece.strip("\n").strip(";")
                    if "m3u8" not in candidate:
                        continue
                    try:
                        stream_url = json.loads(candidate)['streams'][0]['streamUrl']
                    except (ValueError, KeyError, IndexError, TypeError):
                        # Mentions m3u8 but is not the JSON blob we want.
                        continue
                    if not stream_url:
                        continue
                    # Protocol-relative URLs need a scheme; leave absolute
                    # URLs untouched instead of producing "http:https://...".
                    if stream_url.startswith("//"):
                        stream_url = "http:" + stream_url
                    found = stream_url
        return found

    def find_stream_url(self):
        """
        The HLS stream URL for a game is found on the game's detail page.
        Load the page, scrape the URL, store it as m3u8.

        Prints a diagnostic when no valid URL could be identified; m3u8 is
        left unchanged if the page yields nothing.
        """
        vid_r = self.matchparser.sess.get(self.pageurl)
        stream_url = self._extract_stream_url(vid_r.text)
        if stream_url is not None:
            self.m3u8 = stream_url
        # Check for a missing URL first: the validator is not given None.
        if not self.m3u8 or not validators.url(self.m3u8):
            print("Failed to identify HLS URL for {}!".format(
                self.gameslug))

    def download(self):
        """Download a video with youtube-dl"""
        if not self.m3u8:
            print("No stream URL known for {}; skipping download.".format(
                self.gameslug))
            return
        # output file path and filename
        ytdl_out = "{}/{}/{}.%(ext)s".format(
            self.matchparser.args.directory,
            self.parentmatch.matchslug,
            self.gameslug,
        )
        opts = dict(
            outtmpl=ytdl_out,
            format='bestaudio/best',
            logger=YTDLLogger(),
            progress_hooks=[ytdl_hook],
            forceurl=True,
            forcefilename=True,
            restrictfilenames=True,
            nooverwrites=True,
            merge_output_format="mp4",
        )
        with youtube_dl.YoutubeDL(opts) as ydl:
            ydl.download([self.m3u8])
@attr.s
class OWMatch(object):
    """
    An OWMatch is a group of OWGames in a 'best of' series

    :param matchparser: the parser that created this match
    :param matchslug: short identifier for this match
    :type matchslug: str
    :param matchurl: url to the overview page for this match
    :type matchurl: str
    :param matchdesc: display description of the match
    :type matchdesc: str
    :param games: series of games that comprise this match
    :type games: list
    :param matchdiv: parsed catalog element whose children are the game tiles
    """
    # pylint: disable=too-many-arguments
    matchparser = attr.ib()
    matchslug = attr.ib(default=None)
    matchurl = attr.ib(default=None)
    matchdesc = attr.ib(default=None)
    games = attr.ib(default=attr.Factory(list))
    matchdiv = attr.ib(default=None)

    @property
    def duration(self):
        """Sum up the durations of all games in this match"""
        # Games whose duration was never filled in are skipped.
        return sum(
            (game.duration for game in self.games if game.duration is not None),
            timedelta(0))

    def __str__(self):
        return str(self.matchslug)

    def __len__(self):
        return len(self.games)

    @staticmethod
    def _parse_duration(text):
        """
        Convert a clock string such as "SS", "MM:SS" or "H:MM:SS" to a timedelta.

        Unlike strptime("%M:%S") this accepts an hours field and minute
        values above 59, so VODs of an hour or more do not fail.

        :raises ValueError: if a field is not an integer
        """
        seconds = 0
        for field in text.strip().split(":"):
            seconds = seconds * 60 + int(field)
        return timedelta(seconds=seconds)

    def get_game_details(self):
        """Get the game details that can be gleaned from the overview page"""
        if self.matchdiv is None:
            return
        for mdiv in self.matchdiv:
            # Children can be bare text nodes (no tag name); only tags that
            # hold an anchor describe a game.
            if getattr(mdiv, "name", None) is None:
                continue
            anchor = mdiv.find('a')
            if anchor is None:
                continue
            gamedesc = (anchor.get("data-title") or "").strip()
            game = OWGame(
                matchparser=self.matchparser,
                parentmatch=self,
                gamedesc=gamedesc)
            game.pageurl = anchor.get("data-mlg-embed")
            game.date = anchor.get("data-date")
            if mdiv.span is not None:
                game.duration = self._parse_duration(mdiv.span.text)
            # \d+ so that a two-digit game number is not truncated.
            reg = re.match(r".*?Game (?P<game_number>\d+)", gamedesc)
            if reg:
                game.gameslug = "{}-game-{}".format(
                    self.matchslug, reg.group("game_number"))
            else:
                print("Failed to parse game number for {}!".format(gamedesc))
            self.games.append(game)
@attr.s
class OWMatchparser(object):
    """
    Scrape the overwatchleague video catalog page to identify each OWMatch

    :param args: parsed cli args; ``args.url`` is the catalog page to load
    :type args: argparse.Namespace
    :param matches: matches parsed by this parser
    :type matches: dict
    :param sess: CacheControl http client session
    :type sess: cachecontrol.CacheControl() object
    :returns: {matchslug: OWMatch, ...}
    :rtype: dict

    Each match is named in an <h3> tag on the catalog page, which is stored
    as OWMatch.matchdesc; the slug comes from the following element's
    "data-analytics-placement" attribute. The catalog also probably contains
    a bag of other non-match videos, under an <h3> whose text is "VIDEOS".
    Filter that out
    """
    args = attr.ib()
    matches = attr.ib(default=attr.Factory(dict))
    # Built per instance; a plain default would be created once at class
    # definition and shared by every parser.
    sess = attr.ib(default=attr.Factory(
        lambda: cachecontrol.CacheControl(requests.Session())))

    def __iter__(self):
        return iter(self.matches)

    def __contains__(self, value):
        return value in self.matches

    def __len__(self):
        return len(self.matches)

    def __getitem__(self, key):
        # Subscription (parser[slug]) is __getitem__; __get__ is the
        # descriptor hook and was never invoked by indexing.
        return self.matches[key]

    def get_match_overviews(self):
        """
        Load the top-level archive page, detect matches

        Exits the process with status 1 when the page cannot be fetched, is
        empty, or lacks the expected structure.

        :returns: {matchslug: OWMatch, ...}
        :rtype: dict
        """
        req = self.sess.get(self.args.url)
        if req.status_code != 200:
            print("Failed to access args.url: {}".format(self.args.url))
            sys.exit(1)
        if not req.text:
            print("Got a response for {}, it's empty!?".format(self.args.url))
            sys.exit(1)
        soup = BeautifulSoup(req.text, 'html.parser')
        # All the stuff we want is under soup.section.div
        section = soup.section
        div = section.div if section is not None else None
        if div is None:
            print("Couldn't find the video catalog in {}".format(self.args.url))
            sys.exit(1)

        def match_selector(tag):
            """Implement a predicate only slightly too complicated to be in-line"""
            return tag.name == "h3" and tag.text != "VIDEOS"

        # For each match, obtain the corresponding div of OWGames
        for h3_div in div.find_all(match_selector, recursive=False):
            # Look at the sibling before reading attributes from it.
            matchdiv = h3_div.nextSibling
            if matchdiv is None:
                print("Couldn't get div for match {}".format(h3_div))
                sys.exit(1)
            # A text-node sibling has no attribute lookup; treat that the
            # same as a missing slug.
            get_attribute = getattr(matchdiv, 'get', None)
            matchslug = None
            if get_attribute is not None:
                matchslug = get_attribute('data-analytics-placement')
            if matchslug is None:
                print("Couldn't find matchslug for match {}!".format(h3_div))
                sys.exit(1)
            matchdesc = h3_div.text
            self.matches[matchslug] = OWMatch(
                matchparser=self,
                matchslug=matchslug,
                matchdiv=matchdiv,
                matchdesc=matchdesc)
        return self.matches

    def find_games(self, slug=None):
        """
        Return the game for a gameslug, all games in a match for a matchslug,
        or all games when no slug is given.

        :param slug: a matchslug or a gameslug; falsy selects everything
        :type slug: str
        :returns: flat list of OWGame objects (empty when nothing matches)
        :rtype: list
        """
        found = []
        for match in self.matches.values():
            if not slug or match.matchslug == slug:
                found.extend(match.games)
            else:
                found.extend(
                    game for game in match.games if game.gameslug == slug)
        return found

    @property
    def games(self):
        """
        All games of all matches as one flat list.

        A property cannot receive arguments; use find_games(slug) to filter.
        """
        return self.find_games()
class YTDLLogger(object):
    """Logger object for youtube-dl: drops debug and warning text, prints errors."""

    def debug(self, msg):
        """Ignore debug messages."""
        return None

    def warning(self, msg):
        """Ignore warning messages."""
        return None

    @staticmethod
    def error(msg):
        """Print the error message."""
        print(msg)
def ytdl_hook(download):
    """
    YTDL status callbacks

    Args:
        download (dict): progress record passed by youtube-dl. Reads
            'status' and, while downloading, 'filename', 'elapsed',
            'downloaded_bytes', 'total_bytes' (falling back to
            'total_bytes_estimate') and 'eta'.

    Fields that are absent or None are reported as "unknown".
    """
    def field(key, fallback_key=None):
        """Look up a progress field, tolerating missing or None values."""
        value = download.get(key)
        if value is None and fallback_key is not None:
            value = download.get(fallback_key)
        return 'unknown' if value is None else value

    status = download.get('status')
    if status == 'finished':
        print('Done downloading, now converting ...')
    elif status == 'downloading':
        # The record is a dict, so fields are read by key, not attribute.
        msg = "Downloading {} for {} seconds with {} of {} bytes. ETA: {}".format(
            field('filename'), field('elapsed'), field('downloaded_bytes'),
            field('total_bytes', 'total_bytes_estimate'), field('eta'))
        # Return to column 0, rewrite the line, then erase any leftover tail
        # so successive updates overwrite each other.
        sys.stdout.write("\r" + msg + "\033[K")
        sys.stdout.flush()
def main():
    """
    Start here for interactive use

    Parses the command line, scrapes the catalog, prints each match with the
    stream URLs of its games, and, when --get-item is given, downloads the
    games selected by that slug unless --dry-run is set.
    """
    parser = argparse.ArgumentParser(
        description='Download overwatchleague videos', )
    parser.add_argument(
        "-d",
        "--directory",
        type=str,
        help="Path to download directory (cwd by default)",
        default=getcwd(),
    )
    parser.add_argument(
        "-y",
        "--youtube-dl",
        type=str,
        default="youtube-dl",
        help="Path to youtube-dl (find in $PATH by default)",
    )
    parser.add_argument(
        "-u",
        "--url",
        type=str,
        default="https://overwatchleague.com/en-us/videos",
        help=dedent('\n'
                    ' URL to overwatchleague video listing page.\n'
                    ' Default: https://overwatchleague.com/en-us/videos\n'
                    ' '),
    )
    parser.add_argument(
        "--dry-run",
        action='store_true',
        help="Don't download videos",
    )
    parser.add_argument(
        "-g",
        "--get-item",
        type=str,
        help="Download a specific game by slug",
    )
    # NOTE(review): store_true with default=True means -l and -v cannot
    # change anything, and neither value is read below.
    parser.add_argument(
        "-l",
        "--list-matches",
        action='store_true',
        default=True,
        help="List match metadata",
    )
    parser.add_argument(
        "-i",
        "--interactive",
        action='store_true',
        default=False,
        help="Enter IPython console before exit",
    )
    parser.add_argument(
        "-v",
        "--list-videos",
        action='store_true',
        default=True,
        help="List video metadata",
    )
    args = parser.parse_args()

    # This would be an example of what you'd do if importing this as a module
    # I could go straight for a results dict with:
    # matches = OWMatchparser(args).get_match_overviews()
    # ... but instead store a ref to the match parser to use its methods later
    matchparser = OWMatchparser(args)
    matches = matchparser.get_match_overviews()
    print("Found {} matches.".format(len(matches)))
    for match in matches.values():
        print("Getting game metadata for match: {}".format(match))
        match.get_game_details()
        for game in match.games:
            print("Getting stream URL for game: {}".format(game))
            game.find_stream_url()
    print("\n")
    for slug, match in matches.items():
        print(" • {} - {}".format(slug, match.matchdesc))
        for game in match.games:
            pp(game.m3u8)
        print("\n")

    if args.get_item:
        slug = args.get_item.strip("'").strip('"')
        # Select from the parsed matches directly: a match slug picks every
        # game of that match, a game slug picks that single game.
        targets = []
        for match in matches.values():
            if match.matchslug == slug:
                targets.extend(match.games)
            else:
                targets.extend(
                    game for game in match.games if game.gameslug == slug)
        if not targets:
            print("No match or game found for slug: {}".format(slug))
        for game in targets:
            if args.dry_run:
                print("Dry run: not downloading {}".format(game.gameslug))
            else:
                game.download()

    if args.interactive:
        # Imported here so ptpython is only needed for --interactive.
        from ptpython.repl import embed
        embed(globals(), locals())


if __name__ == "__main__":
    main()
| # sample m3u8 url: | |
| # https://mlgmsod-pipeline.akamaized.net/media/production/delivery/73/13/7313aade-cef9-4fff-8021-39e7bed05bda/WOMhqfusMkc_9632a417-8a39-43bb-9d71-5892cdfc4c81_4800k.m3u8 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| youtube_dl==2017.12.14 | |
| validators==0.12.0 | |
| requests==2.18.4 | |
| requests_cache==0.4.13 | |
| ptpython==0.41 | |
| attrs==17.3.0 | |
| beautifulsoup4==4.6.0 | |
| cachecontrol==0.12.3 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment