Last active
October 29, 2017 15:59
-
-
Save p3t3r67x0/5d535dde9bb79cf9ffddeb644e21fd5c to your computer and use it in GitHub Desktop.
Extract the MP4 URL and a properly formatted title from ARTE.tv
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import sys | |
import json | |
import requests | |
from lxml import html | |
from urlparse import urlparse | |
from fake_useragent import UserAgent | |
# Shared fake_useragent instance; main() reads its ``chrome`` attribute to
# obtain the User-Agent string sent with each request.
ua = UserAgent()
def replace_punctuation_mark(text):
    """Strip punctuation from ``text`` and turn blanks into underscores.

    The degree sign, comma, hyphen, slash, colon, exclamation mark,
    question mark, full stop and apostrophe are removed.  Spaces are
    replaced by ``_``.

    :param text: the (unicode) string to clean up.
    :returns: the cleaned string.
    """
    # Characters that are dropped outright.  Escapes are used for the
    # non-ASCII ones so the literal does not depend on the file encoding.
    removed = (u'\u00b0', u',', u'-', u'/', u':', u'!', u'?', u'.', u'\'')
    for mark in removed:
        text = text.replace(mark, u'')

    # NOTE(review): the space replacement is spelled out for both the plain
    # space and the visually identical no-break space so that no invisible
    # character is hidden inside a literal -- confirm U+00A0 is the intended
    # second blank.
    for blank in (u' ', u'\u00a0'):
        text = text.replace(blank, u'_')
    return text
def find_video_url(json_content):
    """Print a file name for the video and return its stream URL.

    The file name is built from the ``VTI`` entry (and the ``subtitle``
    entry, when there is one) of the ``videoJsonPlayer`` object, lower-cased
    and cleaned with ``replace_punctuation_mark``, and is printed to stdout.

    :param json_content: JSON document (as text) containing a
        ``videoJsonPlayer`` object.
    :returns: the value of ``videoJsonPlayer.VSR.HTTPS_SQ_1.url``.
    :raises KeyError: if ``VTI`` or the ``VSR``/``HTTPS_SQ_1``/``url`` path
        is missing from the document.
    """
    player = json.loads(json_content)['videoJsonPlayer']
    title = replace_punctuation_mark(player['VTI'].lower())

    # The subtitle is optional: it may be absent, JSON null or empty.  All
    # three cases yield a file name without a subtitle part (a null value
    # used to crash on ``.lower()``).
    subtitle = player.get('subtitle')
    if subtitle:
        suffix = u'_{}'.format(replace_punctuation_mark(subtitle.lower()))
    else:
        suffix = u''

    file_name = u'{}{}.mp4'.format(title, suffix)
    # Collapse doubled underscores and drop one directly before the extension.
    # The parenthesised print works as a statement on Python 2 and as a
    # function call on Python 3.
    print(file_name.replace('__', '_').replace('_.', '.'))

    return player['VSR']['HTTPS_SQ_1']['url']
def find_iframe_url(text):
    """Return the ``src`` attribute of the first ``<iframe>`` in ``text``.

    :param text: HTML document as a string.
    :returns: the first value matched by the XPath ``//iframe/@src``.
    :raises IndexError: if the document has no iframe with a ``src``.
    """
    iframe_sources = html.document_fromstring(text).xpath('//iframe/@src')
    return iframe_sources[0]
def request_content(url, ua_string, timeout=30):
    """Fetch ``url`` with a GET request and return the response body.

    :param url: address to download.
    :param ua_string: value sent as the ``User-Agent`` header.
    :param timeout: seconds to wait for the server before giving up;
        without it ``requests`` may block indefinitely.
    :returns: the response text when the status code is 200, otherwise
        ``None``.
    """
    headers = {'User-Agent': ua_string, 'Origin': 'https://www.arte.tv'}
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.text
    # Any other status is reported to the caller as "no content".
    return None
def extract_target_json_url(url):
    """Return the query part of ``url`` after undoing selected escapes.

    The literal ``json_url=`` is removed, a fixed set of percent-escapes is
    replaced by the characters they stand for, ``lifeCycle`` is lower-cased
    to ``lifecycle``, and the query component of the resulting URL is
    returned.

    :param url: URL string to rewrite.
    :returns: the query string of the rewritten URL.
    """
    # Applied strictly in this order.
    # NOTE(review): '%3D1' maps to '=' and therefore also drops the digit
    # that follows the escaped equals sign -- confirm this is intended.
    substitutions = (
        ('json_url=', ''),
        ('%3A', ':'),
        ('%3a', ':'),
        ('%2F', '/'),
        ('%2f', '/'),
        ('%3F', '?'),
        ('%3f', '?'),
        ('%3D1', '='),
        ('%3d1', '='),
        ('%26', '&'),
        ('lifeCycle', 'lifecycle'),
    )
    rewritten = url
    for needle, replacement in substitutions:
        rewritten = rewritten.replace(needle, replacement)
    return urlparse(rewritten).query
def main():
    """Command-line entry point.

    Reads the ARTE.tv page URL from the first command-line argument,
    downloads the page, follows its first iframe to the player JSON and
    prints the value returned by ``find_video_url``.  Exits with status 1
    when no URL is given or when either download yields no content.
    """
    if len(sys.argv) < 2:
        print('Enter the ARTE.tv url you want to download the mp4!')
        sys.exit(1)
    arte_url = sys.argv[1]

    content = request_content(arte_url, ua.chrome)
    if content is None:
        # request_content returns None for any non-200 answer; stop here
        # instead of failing later inside the HTML parser.
        sys.exit('Could not download the ARTE.tv page: {}'.format(arte_url))

    target_json_url = extract_target_json_url(find_iframe_url(content))

    json_content = request_content(target_json_url, ua.chrome)
    if json_content is None:
        sys.exit('Could not download the player JSON: {}'.format(target_json_url))

    # The parenthesised print works on both Python 2 and Python 3.
    print(find_video_url(json_content))
# Run the extractor only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment