Skip to content

Instantly share code, notes, and snippets.

@BertrandBordage
Last active February 14, 2016 19:48
Show Gist options
  • Save BertrandBordage/1ba747fbadd1cbef3b2f to your computer and use it in GitHub Desktop.
Save BertrandBordage/1ba747fbadd1cbef3b2f to your computer and use it in GitHub Desktop.
Arte downloader
#!/usr/bin/env python3
import json
from math import ceil
import re
from subprocess import check_output
import sys
from urllib.parse import unquote
from urllib.request import urlopen
# Page to scrape, asked interactively. Surrounding whitespace (easily picked
# up when copy-pasting) is not part of the URL, so it is removed here.
SUBMITTED_URL = input('Enter the video URL: ').strip()
# Base URLs of the embedded player pages recognised in <iframe> tags.
ARTE_EMBED_URLS = (
    'http://www.arte.tv/arte_vp/index.php',
    'https://www-secure.arte.tv/arte_vp/embed.php',
)
# Regex alternation of the (escaped) embed URLs above.
_EMBED_URL_ALTERNATION = '|'.join(map(re.escape, ARTE_EMBED_URLS))

# Captures the quoted value of an ``arte_vp_url_oembed`` attribute.
IFRAME_URL_RE = re.compile(r'arte_vp_url_oembed=[\'"]([^\'"]+)[\'"]')
# Captures the quoted value of an ``arte_vp_url`` attribute.
JSON_URL_RE = re.compile(r'arte_vp_url=[\'"]([^\'"]+)[\'"]')
# Captures the ``json_url`` query parameter of an <iframe> whose ``src``
# points to one of ARTE_EMBED_URLS.
IFRAME_RE = re.compile(
    r'<iframe [^>]+ src="(?:%s)\?json_url=([^&]+)[^"]+"></iframe>'
    % _EMBED_URL_ALTERNATION)
# Matches every character that is replaced when building a file name.
INVALID_FILENAME_CHARS_RE = re.compile(r'[^\-_\.\(\) \w\d]')
def check_response(response):
    """Ensure that an HTTP response was served with status code 200.

    :param response: Object exposing ``getcode()``, such as the value
        returned by ``urllib.request.urlopen``.
    :raises AssertionError: If the status code is anything other than 200.
    """
    # The check must look at the response it was given, not at whatever
    # module-level variable happens to exist. An explicit raise (instead of
    # an ``assert`` statement) keeps it active when Python runs with ``-O``.
    if response.getcode() != 200:
        raise AssertionError('Unable to correctly fetch data.')
class ArteVideoVersion:
    """One downloadable rendition (language and quality) of a video.

    Instances hash and compare equal by URL, and are ordered by
    ``(language, width, height, bitrate)``.
    """

    def __init__(self, video, data):
        """Build a version from one entry of the player JSON.

        :param video: Owning video; its ``duration`` attribute is read to
            estimate the file size.
        :param data: Mapping providing the ``versionLibelle``, ``width``,
            ``height``, ``bitrate`` and ``url`` keys.
        :raises KeyError: If one of these keys is missing from ``data``.
        """
        self.video = video
        self.language = data['versionLibelle']
        self.width = data['width']
        self.height = data['height']
        self.bitrate = data['bitrate']
        self.url = data['url']

    def __repr__(self):
        # Both placeholders need a value: the owning video, then the full
        # description of this version.
        return '<ArteVideoVersion %s [%s]>' % (self.video, self.render())

    def __str__(self):
        return self.render(show_language=False)

    def render(self, show_language=True):
        """Return a one-line description of this version.

        :param show_language: Whether to prefix the description with the
            language label.
        """
        out = '%4d×%-4d@%5d kb/s (%s)' % (
            self.width, self.height, self.bitrate, self.estimated_size)
        if show_language:
            out = '%s %s' % (self.language, out)
        return out

    def __eq__(self, other):
        # Defined alongside __hash__ so that sets actually deduplicate
        # versions pointing to the same URL.
        if not isinstance(other, ArteVideoVersion):
            return NotImplemented
        return self.url == other.url

    def __hash__(self):
        return hash(self.url)

    def __lt__(self, other):
        if not isinstance(other, ArteVideoVersion):
            return NotImplemented
        return ((self.language, self.width, self.height, self.bitrate)
                < (other.language, other.width, other.height, other.bitrate))

    @property
    def estimated_size(self):
        """Lower bound of the file size, computed as duration × bitrate."""
        # The ``kb/s`` label used in render() implies kilobits per second,
        # so dividing by 8e3 converts kilobits to megabytes.
        return '≥%dMB' % ceil((self.video.duration * self.bitrate) / 8e3)
class ArteVideo:
    """Metadata of one video together with its downloadable MP4 versions.

    Instances hash and compare equal by ``json_url`` and are ordered by
    duration.
    """

    def __init__(self, iframe_url=None, json_url=None):
        """Fetch the player JSON and collect the available versions.

        :param iframe_url: URL of a JSON document whose ``html`` entry is an
            ``<iframe>`` pointing to the player; the player JSON URL is
            extracted from it.
        :param json_url: URL of the player JSON itself. When both arguments
            are given, this one takes precedence.
        :raises ValueError: If no URL is given, or if the embed code does
            not contain a player JSON URL.
        """
        if iframe_url is None and json_url is None:
            raise ValueError('Either iframe_url or json_url is required.')
        if iframe_url is not None:
            self.iframe_url = iframe_url
            embed = self._fetch_json(iframe_url)
            match = IFRAME_RE.match(embed['html'])
            if match is None:
                raise ValueError(
                    'No player JSON URL found in the embed code of %s'
                    % iframe_url)
            self.json_url = match.group(1)
        if json_url is not None:
            self.json_url = json_url
        # URLs extracted from an iframe ``src`` are percent-encoded.
        data = self._fetch_json(unquote(self.json_url))['videoJsonPlayer']
        self.title = data.get('VTI')
        self.duration = data['videoDurationSeconds']
        self.versions = set()
        for version_data in data['VSR'].values():
            if version_data['mediaType'] != 'mp4':
                continue
            try:
                self.versions.add(ArteVideoVersion(self, version_data))
            except KeyError:
                # Entries with missing data are deliberately skipped.
                continue

    @staticmethod
    def _fetch_json(url):
        """Download ``url``, check the response and decode it as JSON."""
        # The context manager closes the connection even when the status
        # check or the decoding fails.
        with urlopen(url) as res:
            check_response(res)
            return json.loads(res.read().decode())

    def __repr__(self):
        return '<ArteVideo %s>' % self

    def __str__(self):
        return '%s (%d seconds)' % (self.title, self.duration)

    def __eq__(self, other):
        # Defined alongside __hash__ so that a video found several times in
        # a page is kept only once in a set.
        if not isinstance(other, ArteVideo):
            return NotImplemented
        return self.json_url == other.json_url

    def __hash__(self):
        return hash(self.json_url)

    def __lt__(self, other):
        return self.duration < other.duration
print('Fetching metadata for this page…')
res = urlopen(SUBMITTED_URL)
check_response(res)
content = res.read().decode()

# One set of URLs per pattern, with the HTML-escaped ampersands restored.
iframe_urls, json_urls, other_json_urls = (
    {found.replace('&amp;', '&') for found in pattern.findall(content)}
    for pattern in (IFRAME_URL_RE, JSON_URL_RE, IFRAME_RE)
)

# Each URL set is paired with the ArteVideo keyword argument it feeds.
videos = set()
for keyword, urls in (('iframe_url', iframe_urls),
                      ('json_url', json_urls),
                      ('json_url', other_json_urls)):
    for url in urls:
        video = ArteVideo(**{keyword: url})
        if video.title is None:
            continue
        videos.add(video)
# Longest videos first.
videos = sorted(videos, reverse=True)
print('Fetched!')
def ask_among(question, choices):
    """Let the user pick one item of ``choices`` by its 1-based number.

    A single choice is returned straight away, without prompting.
    Otherwise the question and the numbered choices are printed, and the
    prompt is repeated until a valid number is entered.

    :param question: Text printed above the list of choices.
    :param choices: Non-empty sequence of items to choose from.
    :returns: The selected item.
    :raises ValueError: If ``choices`` is empty (no answer could ever be
        valid, so prompting would never end).
    """
    if not choices:
        raise ValueError('No choice available.')
    if len(choices) == 1:
        return choices[0]
    print()
    print(question)
    for number, choice in enumerate(choices, start=1):
        print('%d. %s' % (number, choice))
    while True:
        answer = input('Enter the number: ')
        try:
            number = int(answer)
        except ValueError:
            continue
        # An explicit range check rejects zero and negative numbers, which
        # plain indexing would silently resolve from the end of the list.
        if 1 <= number <= len(choices):
            return choices[number - 1]
if not videos:
    print('No video found on this page.')
else:
    # Narrow the selection down step by step: video, language, quality.
    video_question = (
        '%d videos found in this page, which to choose?' % len(videos))
    selected_video = ask_among(video_question, videos)

    languages = sorted(
        {version.language for version in selected_video.versions})
    selected_language = ask_among(
        'Which language would you like to download?', languages)

    candidates = sorted(
        {version for version in selected_video.versions
         if version.language == selected_language})
    selected_version = ask_among(
        'Which quality would you like to download?', candidates)

    # Characters matched by INVALID_FILENAME_CHARS_RE become underscores.
    raw_name = '%s - %s.mp4' % (
        selected_version.video.title, selected_version.language)
    filename = INVALID_FILENAME_CHARS_RE.sub('_', raw_name)
    print()
    # The download itself is delegated to the external ``wget`` program.
    check_output(['wget', '-O', filename, selected_version.url])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment