dreness · January 12, 2018 06:37
diff --git a/owl.py b/owl.py
 #!/usr/bin/env python
 # -*- coding: UTF-8 -*-
 """
 Work in progress. For now, just show URLs to m3u8 files of match videos.
 To use, first do:
 pip install -r requirements.txt

 TODO: pre-season VODs were split out by game; season1 VODs contain all games in a match
 """

 from __future__ import unicode_literals

 from os import getcwd
 import re
 import sys
 import json
 import attr
 from textwrap import dedent
 import argparse
 from pprint import pprint as pp
 from datetime import datetime, timedelta
 import youtube_dl
 import requests
 import validators
 import cachecontrol
 from bs4 import BeautifulSoup


 @attr.s
 class OWGame(object):
    """
    A single game of Overwatch. For us, this represents everything that happens
    in a single VOD file on overwatchleague.com.

    Args:
        matchparser (OWMatchparser): reference to the match parser
        parentmatch (OWMatch): reference to the match containing this game
        gamedesc (str): display description
        gameslug (str): short string that is unique relative to siblings
        date (str): posting date, as scraped from the overview page
        m3u8 (str): url to this game's m3u8 file that yields the video.
        pageurl (str): url to the web page that holds the m3u8
        duration (timedelta): duration of game
    """
    # pylint: disable=too-many-instance-attributes
    # pylint: disable=too-many-arguments
    matchparser = attr.ib()
    parentmatch = attr.ib()
    gamedesc = attr.ib()
    gameslug = attr.ib(default=None)
    date = attr.ib(default=None)
    m3u8 = attr.ib(default=None)
    pageurl = attr.ib(default=None)
    duration = attr.ib(default=None)

    def __str__(self):
        return str(self.gamedesc)

    def find_stream_url(self):
        """
        The HLS stream URL for a game is found on the game's detail page.
        Load the page, scrape the URL, store it as m3u8.

        When no usable URL can be found a message is printed and m3u8 is
        left as it was.
        """
        if not self.pageurl:
            # Without a detail page there is nothing to fetch.
            print("Failed to identify HLS URL for {}!".format(self.gameslug))
            return

        vid_r = self.matchparser.sess.get(self.pageurl)
        vid_soup = BeautifulSoup(vid_r.text, 'html.parser')

        # The m3u8 url is specified in one of the javascripts, inside a JSON
        # value sitting on one side of a " = " assignment.
        for script in vid_soup.find_all("script"):
            for line in script.text.split("\n"):
                for val in line.split(" = "):
                    val = val.strip("\n").strip(";")
                    if "m3u8" not in val:
                        continue
                    try:
                        stream = json.loads(val)['streams'][0]['streamUrl']
                        # NOTE(review): assumes streamUrl is protocol-relative
                        # ("//host/...") - confirm against a live page.
                        self.m3u8 = "http:" + stream
                    except (ValueError, KeyError, IndexError, TypeError):
                        # Mentions m3u8 but is not the JSON blob we are
                        # after; keep looking instead of aborting the scrape.
                        continue

        # Check for None first so validators.url() only ever sees a string.
        if not self.m3u8 or not validators.url(self.m3u8):
            print("Failed to identify HLS URL for {}!".format(
                self.gameslug))

    def download(self):
        """
        Download a video with youtube-dl

        The file is written below args.directory, in a folder named after the
        parent match, using the game slug as the file name. Nothing is
        downloaded when no stream URL is known.
        """
        if not self.m3u8:
            print("No HLS URL for {}, nothing to download.".format(
                self.gameslug))
            return

        # output file path and filename
        ytdl_out = "{}/{}/{}.%(ext)s".format(
            self.matchparser.args.directory,
            self.parentmatch.matchslug,
            self.gameslug,
        )

        # NOTE(review): 'bestaudio/best' asks youtube-dl for an audio-only
        # format first - confirm that is intended for a video downloader.
        opts = dict(outtmpl=ytdl_out, format='bestaudio/best', logger=YTDLLogger(), progress_hooks=[ytdl_hook],
                    forceurl=True, forcefilename=True, restrictfilenames=True, nooverwrites=True,
                    merge_output_format="mp4")

        with youtube_dl.YoutubeDL(opts) as ydl:
            ydl.download([self.m3u8])


 @attr.s
 class OWMatch(object):
    """
    An OWMatch is a group of OWGames in a 'best of' series

        :param matchparser: parser that found this match
        :type matchparser: OWMatchparser
        :param matchslug: short identifier of this match
        :type matchslug: str
        :param matchurl: url to the overview page for this match
        :type matchurl: str
        :param matchdesc: display description of this match
        :type matchdesc: str
        :param games: series of games that comprise this match
        :type games: list
        :param matchdiv: parsed page element whose children are the game tiles
        :type matchdiv: bs4 element

    """
    # pylint: disable=too-many-arguments
    matchparser = attr.ib()
    matchslug = attr.ib(default=None)
    matchurl = attr.ib(default=None)
    matchdesc = attr.ib(default=None)
    games = attr.ib(default=attr.Factory(list))
    matchdiv = attr.ib(default=None)

    @property
    def duration(self):
        """Sum up the durations of all games in this match"""
        total = timedelta(0)
        for game in self.games:
            # Games whose duration could not be parsed carry None.
            if game.duration is not None:
                total += game.duration
        return total

    def __str__(self):
        return str(self.matchslug)

    def __len__(self):
        return len(self.games)

    @staticmethod
    def _parse_duration(text):
        """
        Turn a "SS", "MM:SS" or "HH:MM:SS" label into a timedelta.

        Returns None when the text is not one to three colon-separated
        integers.
        """
        try:
            fields = [int(part) for part in text.strip().split(":")]
        except (AttributeError, ValueError):
            return None
        if len(fields) > 3:
            return None
        seconds = 0
        for field in fields:
            seconds = seconds * 60 + field
        return timedelta(seconds=seconds)

    def get_game_details(self):
        """
        Get the game details that can be gleaned from the overview page

        Appends one OWGame to self.games for every child of matchdiv that
        holds a link with a "data-title" attribute.
        """
        # Local import: bs4 is already a dependency of this file.
        from bs4.element import Tag

        if self.matchdiv is None:
            return

        for mdiv in self.matchdiv:
            # Iterating an element also yields the text nodes between its
            # child tags; only real tags can hold a video tile.
            if not isinstance(mdiv, Tag):
                continue
            link = mdiv.find('a')
            if link is None or link.get("data-title") is None:
                continue

            gamedesc = link.get("data-title").strip()
            game = OWGame(
                matchparser=self.matchparser,
                parentmatch=self,
                gamedesc=gamedesc)
            game.pageurl = link.get("data-mlg-embed")
            game.date = link.get("data-date")

            # Parsed by hand rather than with strptime("%M:%S") so that VODs
            # lasting an hour or more are accepted too.
            span = mdiv.span
            if span is not None:
                game.duration = self._parse_duration(span.text)

            reg = re.match(r".*?Game (?P<game_number>\d+)", gamedesc)
            if reg:
                game.gameslug = "{}-game-{}".format(
                    self.matchslug, reg.group('game_number'))
            else:
                print("Failed to parse game number for {}!".format(gamedesc))
            self.games.append(game)


 @attr.s
 class OWMatchparser(object):
    """
        Scrape the overwatchleague video catalog page to identify each OWMatch

        :param args: parsed cli args; must expose the attribute ``url``
        :type args: argparse.Namespace
        :param matches: matches parsed by this parser
        :type matches: dict
        :param sess: CacheControl http client session
        :type sess: cachecontrol.CacheControl() object

    get_match_overviews() returns {matchslug: OWMatch, ...}.

    Each match is named in an <h3> tag on the catalog page; the element that
    follows it holds the games and carries the slug used as
    OWMatch.matchslug.  The catalog also probably contains a bag of other
    non-match videos, under an <h3> whose text is "VIDEOS". Filter that out
    """

    args = attr.ib()
    matches = attr.ib(default=attr.Factory(dict))
    # A Factory, so that every parser builds its own session when it is
    # created instead of all parsers sharing one made at class definition.
    sess = attr.ib(
        default=attr.Factory(
            lambda: cachecontrol.CacheControl(requests.Session())))

    def __iter__(self):
        return iter(self.matches)

    def __contains__(self, value):
        return value in self.matches

    def __len__(self):
        return len(self.matches)

    def __getitem__(self, key):
        # Was spelled __get__, which is the descriptor hook and never made
        # parser[key] work.
        return self.matches[key]

    def get_match_overviews(self):
        """
        Load the top-level archive page, detect matches

        Exits the program (status 1) when the page cannot be loaded or does
        not have the expected structure.

        :returns: {matchslug: OWMatch, ...}
        :rtype: dict
        """
        req = self.sess.get(self.args.url)
        if req.status_code != 200:
            print("Failed to access args.url: {}".format(self.args.url))
            sys.exit(1)

        if not req.text:
            print("Got a response for {}, it's empty!?".format(self.args.url))
            sys.exit(1)

        soup = BeautifulSoup(req.text, 'html.parser')

        # All the stuff we want is under soup.section.div
        div = soup.section.div if soup.section is not None else None
        if div is None:
            print("Couldn't find the video listing in {}".format(
                self.args.url))
            sys.exit(1)

        def match_selector(tag):
            """Implement a predicate only slightly too complicated to be in-line"""
            return tag.name == "h3" and tag.text != "VIDEOS"

        # For each match, obtain the corresponding div of OWGames
        for h3_div in div.find_all(match_selector, recursive=False):
            # Next sibling *tag*: skips any whitespace text node that sits
            # between the heading and the element holding the games.
            matchdiv = h3_div.find_next_sibling(True)
            if matchdiv is None:
                print("Couldn't get div for match {}".format(h3_div))
                sys.exit(1)

            # .get() so that a missing attribute reaches the check below
            # instead of raising KeyError.
            matchslug = matchdiv.get('data-analytics-placement')
            if matchslug is None:
                print("Couldn't find matchslug for match {}!".format(h3_div))
                sys.exit(1)

            self.matches[matchslug] = OWMatch(
                matchparser=self,
                matchslug=matchslug,
                matchdiv=matchdiv,
                matchdesc=h3_div.text)

        return self.matches

    @property
    def games(self, slug=None):
        """
        Return a flat list of all games of all parsed matches.

        As this is a property it is always evaluated with slug=None; the
        parameter only remains so the signature is unchanged. Use
        find_games() to select by slug.
        """
        return self.find_games(slug)

    def find_games(self, slug=None):
        """Return the game for a gameslug, all games in a match for a matchslug, or all games"""
        found = []
        for match in self.matches.values():
            if not slug or match.matchslug == slug:
                found.extend(match.games)
            else:
                found.extend(
                    game for game in match.games if game.gameslug == slug)
        return found


 class YTDLLogger(object):
    """Logger object for youtube-dl: only error messages get printed."""

    def debug(self, msg):
        """Discard debug messages."""

    def warning(self, msg):
        """Discard warnings."""

    @staticmethod
    def error(msg):
        """Print the error message."""
        print(msg)


 def ytdl_hook(download):
    """
    YTDL status callbacks

    :param download: progress information handed over by youtube-dl; its
        'status' entry selects what is reported
    :type download: dict
    """
    status = download.get('status')
    if status == 'finished':
        print('Done downloading, now converting ...')
    elif status == 'downloading':
        # The progress information is a plain dict, so its fields have to be
        # read by key; entries that are absent are shown as None.
        total = download.get('total_bytes')
        if total is None:
            total = download.get('total_bytes_estimate')
        msg = "Downloading {} for {} seconds with {} of {} bytes. ETA: {}".format(
            download.get('filename'), download.get('elapsed'),
            download.get('downloaded_bytes'), total, download.get('eta'))
        sys.stdout.write(msg)
        # NOTE(review): erase-to-end-of-line without a preceding "\r" does
        # not rewind the cursor, so updates are appended - confirm intent.
        sys.stdout.write("\033[K")


 def main():
    """
    Start here for interactive use

    Parses the command line, scrapes the match listing and the stream URL of
    every game, prints what was found, then downloads the games selected
    with --get-item (unless --dry-run is given) and finally opens an
    interactive console when --interactive is set.
    """

    parser = argparse.ArgumentParser(
        description='Download overwatchleague videos', )
    parser.add_argument(
        "-d",
        "--directory",
        type=str,
        help="Path to download directory (cwd by default)",
        default=getcwd(),
    )
    parser.add_argument(
        "-y",
        "--youtube-dl",
        type=str,
        default="youtube-dl",
        help="Path to youtube-dl (find in $PATH by default)",
    )
    parser.add_argument(
        "-u",
        "--url",
        type=str,
        default="https://overwatchleague.com/en-us/videos",
        help=dedent('\n'
                    '\t\t\t\tURL to overwatchleague video listing page.\n'
                    '\t\t\t\tDefault: https://overwatchleague.com/en-us/videos\n'
                    '\t\t\t'),
    )
    parser.add_argument(
        "--dry-run",
        action='store_true',
        help="Don't download videos",
    )
    parser.add_argument(
        "-g",
        "--get-item",
        type=str,
        help="Download a specific game by slug",
    )
    # NOTE(review): store_true combined with default=True means -l and -v
    # (below) can never be False; neither value is consulted in this function.
    parser.add_argument(
        "-l",
        "--list-matches",
        action='store_true',
        default=True,
        help="List match metadata",
    )
    parser.add_argument(
        "-i",
        "--interactive",
        action='store_true',
        default=False,
        help="Enter IPython console before exit",
    )
    parser.add_argument(
        "-v",
        "--list-videos",
        action='store_true',
        default=True,
        help="List video metadata",
    )
    args = parser.parse_args()

    # This would be an example of what you'd do if importing this as a module
    # I could go straight for a results dict with:
    # matches = OWMatchparser(args).get_match_overviews()
    # ... but instead store a ref to the match parser to use its methods later
    matchparser = OWMatchparser(args)
    matches = matchparser.get_match_overviews()
    print("Found {} matches.".format(len(matches)))
    for match in matches.values():
        print("Getting game metadata for match: {}".format(match))
        match.get_game_details()
        for game in match.games:
            print("Getting stream URL for game: {}".format(game))
            game.find_stream_url()

    print("\n")
    for slug, match in matches.items():
        print(" • {} - {}".format(slug, match.matchdesc))
        for game in match.games:
            pp(game.m3u8)
        print("\n")

    if args.get_item:
        wanted = args.get_item.strip("'").strip('"')
        # Select right here: the slug matches either a single game or every
        # game of the match carrying that slug.
        targets = []
        for match in matches.values():
            for game in match.games:
                if wanted in (match.matchslug, game.gameslug):
                    targets.append(game)
        if not targets:
            print("No match or game found for slug: {}".format(wanted))
        for game in targets:
            if args.dry_run:
                print("Dry run, not downloading: {}".format(game.gameslug))
            else:
                game.download()

    if args.interactive:
        from ptpython.repl import embed
        embed(globals(), locals())


 # Run the command line interface only when this file is executed as a
 # script, not when it is imported as a module.
 if __name__ == "__main__":
    main()

 # sample m3u8 url:
 # https://mlgmsod-pipeline.akamaized.net/media/production/delivery/73/13/7313aade-cef9-4fff-8021-39e7bed05bda/WOMhqfusMkc_9632a417-8a39-43bb-9d71-5892cdfc4c81_4800k.m3u8
diff --git a/requirements.txt b/requirements.txt
 youtube_dl==2017.12.14
 validators==0.12.0
 requests==2.18.4
 requests_cache==0.4.13
 ptpython==0.41
 attrs==17.3.0
 beautifulsoup4==4.6.0
 cachecontrol==0.12.3
	#!/usr/bin/env python
	# -- coding: UTF-8 --
	"""
	Work in progress. For now, just show URLs to m3u8 files of match videos.
	To use, first do:
	pip install -r requirements.txt

	TODO: pre-season VODs were split out by game; season1 VODs contain all games in a match
	"""

	from __future__ import unicode_literals

	from os import getcwd
	import re
	import sys
	import json
	import attr
	from textwrap import dedent
	import argparse
	from pprint import pprint as pp
	from datetime import datetime, timedelta
	import youtube_dl
	import requests
	import validators
	import cachecontrol
	from bs4 import BeautifulSoup


	@attr.s
	class OWGame(object):
	    """
	    A single game of Overwatch. For us, this represents everything that happens
	    in a single VOD file on overwatchleague.com.

	    Args:
	        matchparser (OWMatchparser): reference to the match parser
	        parentmatch (OWMatch): reference to the match containing this game
	        gamedesc (str): display description
	        gameslug (str): short string that is unique relative to siblings
	        date (str): posting date, as scraped from the overview page
	        m3u8 (str): url to this game's m3u8 file that yields the video.
	        pageurl (str): url to the web page that holds the m3u8
	        duration (timedelta): duration of game
	    """
	    # pylint: disable=too-many-instance-attributes
	    # pylint: disable=too-many-arguments
	    matchparser = attr.ib()
	    parentmatch = attr.ib()
	    gamedesc = attr.ib()
	    gameslug = attr.ib(default=None)
	    date = attr.ib(default=None)
	    m3u8 = attr.ib(default=None)
	    pageurl = attr.ib(default=None)
	    duration = attr.ib(default=None)

	    def __str__(self):
	        return str(self.gamedesc)

	    def find_stream_url(self):
	        """
	        The HLS stream URL for a game is found on the game's detail page.
	        Load the page, scrape the URL, store it as m3u8.

	        When no usable URL can be found a message is printed and m3u8 is
	        left as it was.
	        """
	        if not self.pageurl:
	            # Without a detail page there is nothing to fetch.
	            print("Failed to identify HLS URL for {}!".format(self.gameslug))
	            return

	        vid_r = self.matchparser.sess.get(self.pageurl)
	        vid_soup = BeautifulSoup(vid_r.text, 'html.parser')

	        # The m3u8 url is specified in one of the javascripts, inside a JSON
	        # value sitting on one side of a " = " assignment.
	        for script in vid_soup.find_all("script"):
	            for line in script.text.split("\n"):
	                for val in line.split(" = "):
	                    val = val.strip("\n").strip(";")
	                    if "m3u8" not in val:
	                        continue
	                    try:
	                        stream = json.loads(val)['streams'][0]['streamUrl']
	                        # NOTE(review): assumes streamUrl is protocol-relative
	                        # ("//host/...") - confirm against a live page.
	                        self.m3u8 = "http:" + stream
	                    except (ValueError, KeyError, IndexError, TypeError):
	                        # Mentions m3u8 but is not the JSON blob we are
	                        # after; keep looking instead of aborting the scrape.
	                        continue

	        # Check for None first so validators.url() only ever sees a string.
	        if not self.m3u8 or not validators.url(self.m3u8):
	            print("Failed to identify HLS URL for {}!".format(
	                self.gameslug))

	    def download(self):
	        """
	        Download a video with youtube-dl

	        The file is written below args.directory, in a folder named after the
	        parent match, using the game slug as the file name. Nothing is
	        downloaded when no stream URL is known.
	        """
	        if not self.m3u8:
	            print("No HLS URL for {}, nothing to download.".format(
	                self.gameslug))
	            return

	        # output file path and filename
	        ytdl_out = "{}/{}/{}.%(ext)s".format(
	            self.matchparser.args.directory,
	            self.parentmatch.matchslug,
	            self.gameslug,
	        )

	        # NOTE(review): 'bestaudio/best' asks youtube-dl for an audio-only
	        # format first - confirm that is intended for a video downloader.
	        opts = dict(outtmpl=ytdl_out, format='bestaudio/best', logger=YTDLLogger(), progress_hooks=[ytdl_hook],
	                    forceurl=True, forcefilename=True, restrictfilenames=True, nooverwrites=True,
	                    merge_output_format="mp4")

	        with youtube_dl.YoutubeDL(opts) as ydl:
	            ydl.download([self.m3u8])


	@attr.s
	class OWMatch(object):
	    """
	    An OWMatch is a group of OWGames in a 'best of' series

	        :param matchparser: parser that found this match
	        :type matchparser: OWMatchparser
	        :param matchslug: short identifier of this match
	        :type matchslug: str
	        :param matchurl: url to the overview page for this match
	        :type matchurl: str
	        :param matchdesc: display description of this match
	        :type matchdesc: str
	        :param games: series of games that comprise this match
	        :type games: list
	        :param matchdiv: parsed page element whose children are the game tiles
	        :type matchdiv: bs4 element

	    """
	    # pylint: disable=too-many-arguments
	    matchparser = attr.ib()
	    matchslug = attr.ib(default=None)
	    matchurl = attr.ib(default=None)
	    matchdesc = attr.ib(default=None)
	    games = attr.ib(default=attr.Factory(list))
	    matchdiv = attr.ib(default=None)

	    @property
	    def duration(self):
	        """Sum up the durations of all games in this match"""
	        total = timedelta(0)
	        for game in self.games:
	            # Games whose duration could not be parsed carry None.
	            if game.duration is not None:
	                total += game.duration
	        return total

	    def __str__(self):
	        return str(self.matchslug)

	    def __len__(self):
	        return len(self.games)

	    @staticmethod
	    def _parse_duration(text):
	        """
	        Turn a "SS", "MM:SS" or "HH:MM:SS" label into a timedelta.

	        Returns None when the text is not one to three colon-separated
	        integers.
	        """
	        try:
	            fields = [int(part) for part in text.strip().split(":")]
	        except (AttributeError, ValueError):
	            return None
	        if len(fields) > 3:
	            return None
	        seconds = 0
	        for field in fields:
	            seconds = seconds * 60 + field
	        return timedelta(seconds=seconds)

	    def get_game_details(self):
	        """
	        Get the game details that can be gleaned from the overview page

	        Appends one OWGame to self.games for every child of matchdiv that
	        holds a link with a "data-title" attribute.
	        """
	        # Local import: bs4 is already a dependency of this file.
	        from bs4.element import Tag

	        if self.matchdiv is None:
	            return

	        for mdiv in self.matchdiv:
	            # Iterating an element also yields the text nodes between its
	            # child tags; only real tags can hold a video tile.
	            if not isinstance(mdiv, Tag):
	                continue
	            link = mdiv.find('a')
	            if link is None or link.get("data-title") is None:
	                continue

	            gamedesc = link.get("data-title").strip()
	            game = OWGame(
	                matchparser=self.matchparser,
	                parentmatch=self,
	                gamedesc=gamedesc)
	            game.pageurl = link.get("data-mlg-embed")
	            game.date = link.get("data-date")

	            # Parsed by hand rather than with strptime("%M:%S") so that VODs
	            # lasting an hour or more are accepted too.
	            span = mdiv.span
	            if span is not None:
	                game.duration = self._parse_duration(span.text)

	            reg = re.match(r".*?Game (?P<game_number>\d+)", gamedesc)
	            if reg:
	                game.gameslug = "{}-game-{}".format(
	                    self.matchslug, reg.group('game_number'))
	            else:
	                print("Failed to parse game number for {}!".format(gamedesc))
	            self.games.append(game)


	@attr.s
	class OWMatchparser(object):
	    """
	        Scrape the overwatchleague video catalog page to identify each OWMatch

	        :param args: parsed cli args; must expose the attribute ``url``
	        :type args: argparse.Namespace
	        :param matches: matches parsed by this parser
	        :type matches: dict
	        :param sess: CacheControl http client session
	        :type sess: cachecontrol.CacheControl() object

	    get_match_overviews() returns {matchslug: OWMatch, ...}.

	    Each match is named in an <h3> tag on the catalog page; the element that
	    follows it holds the games and carries the slug used as
	    OWMatch.matchslug.  The catalog also probably contains a bag of other
	    non-match videos, under an <h3> whose text is "VIDEOS". Filter that out
	    """

	    args = attr.ib()
	    matches = attr.ib(default=attr.Factory(dict))
	    # A Factory, so that every parser builds its own session when it is
	    # created instead of all parsers sharing one made at class definition.
	    sess = attr.ib(
	        default=attr.Factory(
	            lambda: cachecontrol.CacheControl(requests.Session())))

	    def __iter__(self):
	        return iter(self.matches)

	    def __contains__(self, value):
	        return value in self.matches

	    def __len__(self):
	        return len(self.matches)

	    def __getitem__(self, key):
	        # Was spelled __get__, which is the descriptor hook and never made
	        # parser[key] work.
	        return self.matches[key]

	    def get_match_overviews(self):
	        """
	        Load the top-level archive page, detect matches

	        Exits the program (status 1) when the page cannot be loaded or does
	        not have the expected structure.

	        :returns: {matchslug: OWMatch, ...}
	        :rtype: dict
	        """
	        req = self.sess.get(self.args.url)
	        if req.status_code != 200:
	            print("Failed to access args.url: {}".format(self.args.url))
	            sys.exit(1)

	        if not req.text:
	            print("Got a response for {}, it's empty!?".format(self.args.url))
	            sys.exit(1)

	        soup = BeautifulSoup(req.text, 'html.parser')

	        # All the stuff we want is under soup.section.div
	        div = soup.section.div if soup.section is not None else None
	        if div is None:
	            print("Couldn't find the video listing in {}".format(
	                self.args.url))
	            sys.exit(1)

	        def match_selector(tag):
	            """Implement a predicate only slightly too complicated to be in-line"""
	            return tag.name == "h3" and tag.text != "VIDEOS"

	        # For each match, obtain the corresponding div of OWGames
	        for h3_div in div.find_all(match_selector, recursive=False):
	            # Next sibling *tag*: skips any whitespace text node that sits
	            # between the heading and the element holding the games.
	            matchdiv = h3_div.find_next_sibling(True)
	            if matchdiv is None:
	                print("Couldn't get div for match {}".format(h3_div))
	                sys.exit(1)

	            # .get() so that a missing attribute reaches the check below
	            # instead of raising KeyError.
	            matchslug = matchdiv.get('data-analytics-placement')
	            if matchslug is None:
	                print("Couldn't find matchslug for match {}!".format(h3_div))
	                sys.exit(1)

	            self.matches[matchslug] = OWMatch(
	                matchparser=self,
	                matchslug=matchslug,
	                matchdiv=matchdiv,
	                matchdesc=h3_div.text)

	        return self.matches

	    @property
	    def games(self, slug=None):
	        """
	        Return a flat list of all games of all parsed matches.

	        As this is a property it is always evaluated with slug=None; the
	        parameter only remains so the signature is unchanged. Use
	        find_games() to select by slug.
	        """
	        return self.find_games(slug)

	    def find_games(self, slug=None):
	        """Return the game for a gameslug, all games in a match for a matchslug, or all games"""
	        found = []
	        for match in self.matches.values():
	            if not slug or match.matchslug == slug:
	                found.extend(match.games)
	            else:
	                found.extend(
	                    game for game in match.games if game.gameslug == slug)
	        return found


	class YTDLLogger(object):
	    """Logger object for youtube-dl: only error messages get printed."""

	    # pylint: disable=missing-docstring

	    def debug(self, msg):
	        pass

	    def warning(self, msg):
	        pass

	    @staticmethod
	    def error(msg):
	        print(msg)


	def ytdl_hook(download):
	    """
	    YTDL status callbacks

	    :param download: progress information handed over by youtube-dl; its
	        'status' entry selects what is reported
	    :type download: dict
	    """
	    status = download.get('status')
	    if status == 'finished':
	        print('Done downloading, now converting ...')
	    elif status == 'downloading':
	        # The progress information is a plain dict, so its fields have to be
	        # read by key; entries that are absent are shown as None.
	        total = download.get('total_bytes')
	        if total is None:
	            total = download.get('total_bytes_estimate')
	        msg = "Downloading {} for {} seconds with {} of {} bytes. ETA: {}".format(
	            download.get('filename'), download.get('elapsed'),
	            download.get('downloaded_bytes'), total, download.get('eta'))
	        sys.stdout.write(msg)
	        # NOTE(review): erase-to-end-of-line without a preceding "\r" does
	        # not rewind the cursor, so updates are appended - confirm intent.
	        sys.stdout.write("\033[K")


	def main():
	    """
	    Start here for interactive use

	    Parses the command line, scrapes the match listing and the stream URL of
	    every game, prints what was found, then downloads the games selected
	    with --get-item (unless --dry-run is given) and finally opens an
	    interactive console when --interactive is set.
	    """

	    parser = argparse.ArgumentParser(
	        description='Download overwatchleague videos', )
	    parser.add_argument(
	        "-d",
	        "--directory",
	        type=str,
	        help="Path to download directory (cwd by default)",
	        default=getcwd(),
	    )
	    parser.add_argument(
	        "-y",
	        "--youtube-dl",
	        type=str,
	        default="youtube-dl",
	        help="Path to youtube-dl (find in $PATH by default)",
	    )
	    parser.add_argument(
	        "-u",
	        "--url",
	        type=str,
	        default="https://overwatchleague.com/en-us/videos",
	        help=dedent('\n'
	                    '\t\t\t\tURL to overwatchleague video listing page.\n'
	                    '\t\t\t\tDefault: https://overwatchleague.com/en-us/videos\n'
	                    '\t\t\t'),
	    )
	    parser.add_argument(
	        "--dry-run",
	        action='store_true',
	        help="Don't download videos",
	    )
	    parser.add_argument(
	        "-g",
	        "--get-item",
	        type=str,
	        help="Download a specific game by slug",
	    )
	    # NOTE(review): store_true combined with default=True means -l and -v
	    # (below) can never be False; neither value is consulted in this function.
	    parser.add_argument(
	        "-l",
	        "--list-matches",
	        action='store_true',
	        default=True,
	        help="List match metadata",
	    )
	    parser.add_argument(
	        "-i",
	        "--interactive",
	        action='store_true',
	        default=False,
	        help="Enter IPython console before exit",
	    )
	    parser.add_argument(
	        "-v",
	        "--list-videos",
	        action='store_true',
	        default=True,
	        help="List video metadata",
	    )
	    args = parser.parse_args()

	    # This would be an example of what you'd do if importing this as a module
	    # I could go straight for a results dict with:
	    # matches = OWMatchparser(args).get_match_overviews()
	    # ... but instead store a ref to the match parser to use its methods later
	    matchparser = OWMatchparser(args)
	    matches = matchparser.get_match_overviews()
	    print("Found {} matches.".format(len(matches)))
	    for match in matches.values():
	        print("Getting game metadata for match: {}".format(match))
	        match.get_game_details()
	        for game in match.games:
	            print("Getting stream URL for game: {}".format(game))
	            game.find_stream_url()

	    print("\n")
	    for slug, match in matches.items():
	        print(" • {} - {}".format(slug, match.matchdesc))
	        for game in match.games:
	            pp(game.m3u8)
	        print("\n")

	    if args.get_item:
	        wanted = args.get_item.strip("'").strip('"')
	        # Select right here: the slug matches either a single game or every
	        # game of the match carrying that slug.
	        targets = []
	        for match in matches.values():
	            for game in match.games:
	                if wanted in (match.matchslug, game.gameslug):
	                    targets.append(game)
	        if not targets:
	            print("No match or game found for slug: {}".format(wanted))
	        for game in targets:
	            if args.dry_run:
	                print("Dry run, not downloading: {}".format(game.gameslug))
	            else:
	                game.download()

	    if args.interactive:
	        from ptpython.repl import embed
	        embed(globals(), locals())


	# Run the command line interface only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == "__main__":
	    main()

	# sample m3u8 url:
	# https://mlgmsod-pipeline.akamaized.net/media/production/delivery/73/13/7313aade-cef9-4fff-8021-39e7bed05bda/WOMhqfusMkc_9632a417-8a39-43bb-9d71-5892cdfc4c81_4800k.m3u8
	youtube_dl==2017.12.14
	validators==0.12.0
	requests==2.18.4
	requests_cache==0.4.13
	ptpython==0.41
	attrs==17.3.0
	beautifulsoup4==4.6.0
	cachecontrol==0.12.3