Last active
February 14, 2016 19:48
-
-
Save BertrandBordage/1ba747fbadd1cbef3b2f to your computer and use it in GitHub Desktop.
Arte downloader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import json | |
from math import ceil | |
import re | |
from subprocess import check_output | |
import sys | |
from urllib.parse import unquote | |
from urllib.request import urlopen | |
# URL of the page to scan, asked interactively when the script starts.
# Surrounding whitespace is stripped: a stray space picked up while pasting
# would otherwise make urlopen() reject the URL.
SUBMITTED_URL = input('Enter the video URL: ').strip()
# Captures the value of an ``arte_vp_url_oembed`` attribute, whether it is
# single- or double-quoted; group 1 is the URL.
IFRAME_URL_RE = re.compile(r'arte_vp_url_oembed=[\'"]([^\'"]+)[\'"]')

# Player endpoints accepted as the ``src`` of an embedding <iframe>.
ARTE_EMBED_URLS = (
    'http://www.arte.tv/arte_vp/index.php',
    'https://www-secure.arte.tv/arte_vp/embed.php',
)

# Matches an <iframe> whose ``src`` is one of ARTE_EMBED_URLS; group 1 is the
# raw ``json_url`` query value (everything up to the next ``&``).
IFRAME_RE = re.compile(
    r'<iframe [^>]+ src="(?:%s)\?json_url=([^&]+)[^"]+"></iframe>'
    % '|'.join(map(re.escape, ARTE_EMBED_URLS)))

# Captures the value of an ``arte_vp_url`` attribute; group 1 is the URL.
JSON_URL_RE = re.compile(r'arte_vp_url=[\'"]([^\'"]+)[\'"]')

# Any character outside ``- _ . ( )``, space, word characters and digits.
INVALID_FILENAME_CHARS_RE = re.compile(r'[^\-_\.\(\) \w\d]')
def check_response(response):
    """Ensure that an HTTP response was fetched successfully.

    :param response: object exposing ``getcode()``, such as the value
        returned by ``urlopen``.
    :raises AssertionError: if the status code is not 200.
    """
    # Inspect the response that was passed in, not a same-named global.
    # Raised explicitly (rather than with ``assert``) so that the check is
    # still performed when Python runs with ``-O``.
    if response.getcode() != 200:
        raise AssertionError('Unable to correctly fetch data.')
class ArteVideoVersion:
    """One downloadable rendition (language + quality) of a video.

    Two versions are considered equal when they point to the same URL;
    ordering is by language, then width, height and bitrate.
    """

    def __init__(self, video, data):
        """Build a version from one entry of the video's version data.

        :param video: the owning video object; its ``duration`` attribute
            is read to estimate the file size.
        :param data: mapping with the keys ``versionLibelle``, ``width``,
            ``height``, ``bitrate`` and ``url``.
        :raises KeyError: if one of these keys is missing from ``data``.
        """
        self.video = video
        self.language = data['versionLibelle']
        self.width = data['width']
        self.height = data['height']
        self.bitrate = data['bitrate']
        self.url = data['url']

    def __repr__(self):
        # The format string has two placeholders, so it needs two values:
        # the language, then the language-less rendering.
        return '<ArteVideoVersion %s [%s]>' % (
            self.language, self.render(show_language=False))

    def __str__(self):
        return self.render(show_language=False)

    def render(self, show_language=True):
        """Return a one-line description of this version.

        :param show_language: when true, prefix the description with the
            language label.
        """
        out = '%4d×%-4d@%5d kb/s (%s)' % (
            self.width, self.height, self.bitrate, self.estimated_size)
        if show_language:
            out = '%s %s' % (self.language, out)
        return out

    def __eq__(self, other):
        # Defined alongside __hash__ so that sets actually deduplicate
        # versions sharing the same URL.
        if not isinstance(other, ArteVideoVersion):
            return NotImplemented
        return self.url == other.url

    def __hash__(self):
        return hash(self.url)

    def __lt__(self, other):
        return ((self.language, self.width, self.height, self.bitrate)
                < (other.language, other.width, other.height, other.bitrate))

    @property
    def estimated_size(self):
        """Lower bound of the file size, formatted as ``'≥<n>MB'``.

        Computed as duration × bitrate / 8e3, rounded up.
        """
        return '≥%dMB' % ceil((self.video.duration * self.bitrate) / 8e3)
class ArteVideo:
    """A video described by a player JSON document, with its MP4 versions.

    Two videos are considered equal when they share the same ``json_url``;
    ordering is by duration.
    """

    def __init__(self, iframe_url=None, json_url=None):
        """Fetch the metadata of a video.

        :param iframe_url: URL of a JSON document whose ``html`` field holds
            the embedding <iframe>; the JSON URL is extracted from it.
        :param json_url: URL (possibly percent-quoted) of the player JSON.
            When both arguments are given, this one takes precedence.
        :raises ValueError: if neither URL is given.
        :raises AssertionError: if a request does not answer with code 200
            (raised by ``check_response``).
        """
        if iframe_url is None and json_url is None:
            raise ValueError('Either iframe_url or json_url is required.')

        if iframe_url is not None:
            self.iframe_url = iframe_url
            # Fetch the URL received as argument, and close the connection
            # as soon as the body has been read.
            with urlopen(iframe_url) as res:
                check_response(res)
                data = json.loads(res.read().decode())
            self.json_url = IFRAME_RE.match(data['html']).group(1)
        if json_url is not None:
            self.json_url = json_url

        with urlopen(unquote(self.json_url)) as res:
            check_response(res)
            data = json.loads(res.read().decode())['videoJsonPlayer']

        self.title = data.get('VTI')
        self.duration = data['videoDurationSeconds']

        self.versions = set()
        for version_data in data['VSR'].values():
            if version_data['mediaType'] != 'mp4':
                continue
            try:
                self.versions.add(ArteVideoVersion(self, version_data))
            except KeyError:  # When data is missing from JSONs.
                pass

    def __repr__(self):
        return '<ArteVideo %s>' % self

    def __str__(self):
        return '%s (%d seconds)' % (self.title, self.duration)

    def __eq__(self, other):
        # Defined alongside __hash__ so that sets actually deduplicate
        # videos sharing the same JSON URL.
        if not isinstance(other, ArteVideo):
            return NotImplemented
        return self.json_url == other.json_url

    def __hash__(self):
        return hash(self.json_url)

    def __lt__(self, other):
        return self.duration < other.duration
print('Fetching metadata for this page…')
# Download the submitted page; the connection is closed once it is read.
with urlopen(SUBMITTED_URL) as res:
    check_response(res)
    content = res.read().decode()

# URLs are taken from HTML attributes, where ``&`` is written ``&amp;``;
# decode the entity to get usable URLs.
iframe_urls = {url.replace('&amp;', '&')
               for url in IFRAME_URL_RE.findall(content)}
json_urls = {url.replace('&amp;', '&')
             for url in JSON_URL_RE.findall(content)}
other_json_urls = {url.replace('&amp;', '&')
                   for url in IFRAME_RE.findall(content)}

# Only videos that have a title are kept.
videos = set()
for url in iframe_urls:
    video = ArteVideo(iframe_url=url)
    if video.title is not None:
        videos.add(video)
# Both JSON URL sources are handled the same way; the union avoids
# fetching twice a URL that was found by both patterns.
for url in json_urls | other_json_urls:
    video = ArteVideo(json_url=url)
    if video.title is not None:
        videos.add(video)

# Longest videos first.
videos = sorted(videos, reverse=True)
print('Fetched!')
def ask_among(question, choices):
    """Ask the user to pick one item of ``choices`` and return it.

    When there is a single choice it is returned without prompting.
    Otherwise the numbered choices are printed and the user is asked
    again until a number between 1 and ``len(choices)`` is entered.

    :param question: text printed above the numbered list.
    :param choices: non-empty sequence of items to choose from.
    :returns: the selected item.
    :raises ValueError: if ``choices`` is empty, since no answer could
        ever be valid.
    """
    if not choices:
        raise ValueError('No choice available.')
    if len(choices) == 1:
        return choices[0]

    print()
    print(question)
    for number, choice in enumerate(choices, start=1):
        print('%d. %s' % (number, choice))

    while True:
        answer = input('Enter the number: ')
        try:
            number = int(answer)
        except ValueError:
            continue
        # Explicit bounds check: zero and negative numbers must be refused
        # instead of silently indexing from the end of the sequence.
        if 1 <= number <= len(choices):
            return choices[number - 1]
if videos:
    selected_video = ask_among(
        '%d videos found in this page, which to choose?' % len(videos), videos)
    if selected_video.versions:
        selected_language = ask_among(
            'Which language would you like to download?',
            sorted({v.language for v in selected_video.versions}))
        selected_version = ask_among(
            'Which quality would you like to download?',
            sorted({v for v in selected_video.versions
                    if v.language == selected_language}))
        filename = '%s - %s.mp4' % (selected_version.video.title,
                                    selected_version.language)
        # Replace every character that is not allowed in the file name.
        filename = INVALID_FILENAME_CHARS_RE.sub('_', filename)
        print()
        # The download itself is delegated to wget; a failing wget raises
        # CalledProcessError.
        check_output(['wget', '-O', filename, selected_version.url])
    else:
        # Without any version there is nothing to choose from, so report
        # it instead of prompting with an empty list.
        print('No downloadable MP4 version found for this video.')
else:
    print('No video found on this page.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment