Last active
January 24, 2022 10:51
-
-
Save MinePlayersPE/589edb2e424f272bf82e3eaa4beb6130 to your computer and use it in GitHub Desktop.
iq.com yt-dlp extractor plugin (OUTDATED, SEE IqIE IN https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/extractor/iqiyi.py INSTEAD)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
from __future__ import unicode_literals | |
import base64 | |
import json | |
import random | |
# ⚠ Don't use relative imports
from yt_dlp.extractor.common import InfoExtractor | |
from yt_dlp.extractor.openload import PhantomJSwrapper | |
from yt_dlp.utils import ( | |
ExtractorError, | |
float_or_none, | |
join_nonempty, | |
js_to_json, | |
parse_age_limit, | |
parse_duration, | |
parse_iso8601, | |
parse_resolution, | |
qualities, | |
str_or_none, | |
traverse_obj, | |
urljoin | |
) | |
# ℹ️ Instructions on making extractors can be found at:
# 🔗 https://github.com/ytdl-org/youtube-dl#adding-support-for-a-new-site
class IqIE(InfoExtractor):
    """Extractor for single videos served from ``iq.com/play/...`` pages.

    The page's player JavaScript is downloaded to obtain the request-signing
    function (``cmd5x``), which is then executed through PhantomJS to build
    signed ``/dash`` request paths, one per known quality id ("bid").
    """

    IE_NAME = 'iq.com'
    IE_DESC = 'International version of iQiyi'
    _VALID_URL = r'https?://(?:www\.)?iq\.com/play/(?:[\w-]*-)?(?P<id>\w+)'

    # Built-in bid -> display name table; replaced per instance by
    # _update_bid_tags() when the player JS exposes its own table.
    _BID_TAGS = {
        '100': '240P',
        '200': '360P',
        '300': '480P',
        '500': '720P',
        '600': '1080P',
        '610': '1080P50',
        '700': '2K',
        '800': '4K',
    }

    # Subtitle language id ("lid") -> language code.
    _LID_TAGS = {
        '1': 'zh_CN',
        '2': 'zh_TW',
        '3': 'en',
        '18': 'th',
        '21': 'my',
        '23': 'vi',
        '24': 'id',
        '26': 'es',
        '28': 'ar',
    }

    def _extract_vms_player_js(self, webpage, video_id):
        """Return the source of the JS module that contains ``'vms request'``.

        The result is stored in (and first looked up from) the downloader
        cache under section ``'iq'``, key ``'player_js'``.

        Raises ExtractorError when no downloaded module contains the marker.
        """
        player_js_cache = self._downloader.cache.load('iq', 'player_js')
        if player_js_cache:
            return player_js_cache

        webpack_js_url = self._proto_relative_url(self._search_regex(
            r'<script src="([\w\./]+/static/chunks/webpack-\w+\.js)"', webpage, 'webpack URL'))
        webpack_js = self._download_webpage(
            webpack_js_url, video_id, note='Downloading webpack JS', errnote='Unable to download webpack JS')

        # Two JS object literals: chunk index -> name, and chunk index -> hash.
        webpack_map1, webpack_map2 = [
            self._parse_json(js_map, video_id, transform_source=js_to_json)
            for js_map in self._search_regex(
                r'\(({[^}]*})\[\w+\][^\)]*\)\s*\+\s*["\']\.["\']\s*\+\s*({[^}]*})\[\w+\]\+["\']\.js',
                webpack_js, 'JS locations', group=(1, 2))]

        # reversed() on a dict view needs Python 3.8+; reversing a list works everywhere.
        for module_index in reversed(list(webpack_map2)):
            module_js = self._download_webpage(
                f'https://stc.iqiyipic.com/_next/static/chunks/{webpack_map1.get(module_index, module_index)}.{webpack_map2[module_index]}.js',
                video_id, note=f'Downloading #{module_index} module JS',
                errnote='Unable to download module JS', fatal=False) or ''
            if 'vms request' in module_js:
                self._downloader.cache.store('iq', 'player_js', module_js)
                return module_js
        raise ExtractorError('Unable to extract player JS')

    def _extract_cmd5x_function(self, webpage, video_id):
        """Return the JS source of the signature function found in the player JS."""
        return self._search_regex(
            r',\s*(function\s*\([^\)]*\)\s*{\s*var _qda.+_qdc\(\)\s*})\s*,',
            self._extract_vms_player_js(webpage, video_id), 'signature function')

    def _update_bid_tags(self, webpage, video_id):
        """Replace ``self._BID_TAGS`` with the table found in the player JS, if any.

        Leaves the current table untouched when nothing could be extracted.
        """
        extracted_bid_tags = self._parse_json(
            self._search_regex(
                r'arguments\[1\][^,]*,\s*function\(\w,\w,\w\)\s*{\s*"use strict";\s*var \w=({.+}})\s*,\s*\w\s*=\s*{getNewVd',
                # '{}' (not '') so a failed match parses cleanly instead of
                # producing a JSON parse warning.
                self._extract_vms_player_js(webpage, video_id), 'video tags', default='{}'),
            video_id, transform_source=js_to_json, fatal=False)
        if not extracted_bid_tags:
            return
        # Entries without a usable 'value' keep the previously known name
        # (or the bid itself) instead of raising KeyError.
        self._BID_TAGS = {
            bid: traverse_obj(
                extracted_bid_tags, (bid, 'value'), expected_type=str_or_none,
                default=self._BID_TAGS.get(bid, bid))
            for bid in extracted_bid_tags
        }

    def _real_extract(self, url):
        """Extract metadata, formats and subtitles for one video page."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        self._update_bid_tags(webpage, video_id)

        page_data = self._search_nextjs_data(webpage, video_id)['props']['initialState']['play']
        video_info = page_data['curVideoInfo']
        tvid, vid = video_info['tvId'], video_info['vid']

        # Single cookie lookup, always with a scheme (the original looked the
        # value up under the scheme-less 'iq.com').
        dfp_cookie = self._get_cookies('https://iq.com').get('__dfp')
        dfp = dfp_cookie.value if dfp_cookie else ''

        cmd5x_func = self._extract_cmd5x_function(webpage, video_id)

        # Values are embedded as JSON literals so that quotes/backslashes in
        # page data or cookies cannot break out of the JS string.
        # bid 0 as an initial format checker
        js_args = (
            json.dumps(str(tvid)),
            json.dumps(str(vid)),
            json.dumps(str(dfp)),
            json.dumps(['0', *self._BID_TAGS]),
            cmd5x_func,
        )
        dash_paths = self._parse_json(PhantomJSwrapper(self).get(
            url, html='<!DOCTYPE html>', video_id=video_id, note2='Executing signature code', jscode="""
            console.log(page.evaluate(function() {
                var tvid = %s; var vid = %s; var dfp = %s; var bid_list = %s; var tm = new Date().getTime();
                var cmd5x_func = %s; var cmd5x_exporter = {}; cmd5x_func({}, cmd5x_exporter, {}); var cmd5x = cmd5x_exporter.cmd5x;
                var authKey = cmd5x(cmd5x('') + tm + '' + tvid);
                var k_uid = Array.apply(null, Array(32)).map(function() {return Math.floor(Math.random() * 16).toString(16)}).join('');
                var dash_paths = {};
                bid_list.forEach(function(bid) {
                    var query = {
                        'tvid': tvid,
                        'bid': bid,
                        'ds': 1,
                        'vid': vid,
                        'src': '01010031010018000000',
                        'vt': 0,
                        'rs': 1,
                        'uid': 0,
                        'ori': 'pcw',
                        'ps': 1,
                        'k_uid': k_uid,
                        'pt': 0,
                        'd': 0,
                        's': '',
                        'lid': '',
                        'slid': 0,
                        'cf': '',
                        'ct': '',
                        'authKey': authKey,
                        'k_tag': 1,
                        'ost': 0,
                        'ppt': 0,
                        'dfp': dfp,
                        'prio': JSON.stringify({
                            'ff': 'f4v',
                            'code': 2
                        }),
                        'k_err_retries': 0,
                        'up': '',
                        'su': 2,
                        'applang': 'en_us',
                        'sver': 2,
                        'X-USER-MODE': 'id',
                        'qd_v': 2,
                        'tm': tm,
                        'qdy': 'a',
                        'qds': 0,
                        'k_ft1': 141287244169348,
                        'k_ft4': 34359746564,
                        'k_ft5': 1,
                        'bop': JSON.stringify({
                            'version': '10.0',
                            'dfp': dfp
                        }),
                        'ut': 0,
                    };
                    var enc_params = [];
                    for (var prop in query) {
                        enc_params.push(encodeURIComponent(prop) + '=' + encodeURIComponent(query[prop]));
                    }
                    var dash_path = '/dash?' + enc_params.join('&'); dash_path += '&vf=' + cmd5x(dash_path);
                    dash_paths[bid] = dash_path;
                });
                return JSON.stringify(dash_paths);
            }));
            saveAndExit();
            """ % js_args)[1].strip(), video_id)

        formats, subtitles = [], {}
        initial_format_data = self._download_json(
            urljoin('https://cache-video.iq.com', dash_paths['0']), video_id,
            note='Downloading initial video format info',
            errnote='Unable to download initial video format info')['data']

        preview_time = traverse_obj(
            initial_format_data, ('boss_ts', 'data', 'previewTime'), expected_type=float_or_none)
        if preview_time:
            self.report_warning(f'This preview video is limited to {preview_time} seconds')

        # Built once; ranks a bid by its position in the tag table.
        get_quality = qualities(list(self._BID_TAGS))

        available_bids = traverse_obj(
            initial_format_data, ('program', 'video', ..., 'bid'), expected_type=str_or_none, default=[])
        for bid in set(available_bids):
            dash_path = dash_paths.get(bid)
            if not dash_path:
                self.report_warning(f'Unknown format id: {bid}. It is currently not being extracted')
                continue
            bid_tag = self._BID_TAGS.get(bid, bid)

            # The download is non-fatal, so fall back to an empty dict to keep
            # the .get() calls below safe.
            format_data = traverse_obj(self._download_json(
                urljoin('https://cache-video.iq.com', dash_path), video_id,
                note=f'Downloading format data for {bid_tag}', errnote='Unable to download format data',
                fatal=False), 'data', expected_type=dict) or {}

            video_format = next((
                candidate for candidate in traverse_obj(
                    format_data, ('program', 'video', ...), expected_type=dict, default=[])
                if str(candidate.get('bid')) == bid), {})

            # Accumulate across source kinds; the original overwrote earlier
            # results when more than one of these keys was present.
            extracted_formats = []
            if video_format.get('m3u8Url'):
                extracted_formats.extend(self._extract_m3u8_formats(
                    urljoin(format_data.get('dm3u8', 'https://cache-m.iq.com/dc/dt/'), video_format['m3u8Url']),
                    'mp4', m3u8_id=bid))
            if video_format.get('mpdUrl'):
                # TODO: Properly extract mpd hostname
                extracted_formats.extend(self._extract_mpd_formats(
                    urljoin(format_data.get('dm', 'http://meta.video.iqiyi.com'), video_format['mpdUrl']),
                    mpd_id=bid))
            if video_format.get('m3u8'):
                ff = video_format.get('ff', 'ts')
                if ff == 'ts':
                    # The manifest is inline; a data: URL stands in for its location.
                    m3u8_formats, _ = self._parse_m3u8_formats_and_subtitles(
                        video_format['m3u8'],
                        'data:application/x-mpegurl;base64,' + base64.b64encode(video_format['m3u8'].encode('utf-8')).decode('ascii'),
                        'mp4', m3u8_id=bid)
                    extracted_formats.extend(m3u8_formats)
                elif ff == 'm4s':
                    # NOTE(review): the inline payload is passed through as-is;
                    # confirm it is in the form the MPD parser expects.
                    mpd_formats, _ = self._parse_mpd_formats_and_subtitles(
                        video_format['m3u8'], bid, format_data.get('dm3u8', 'https://cache-m.iq.com/dc/dt/'),
                        'data:application/dash+xml;base64,' + base64.b64encode(video_format['m3u8'].encode('utf-8')).decode('ascii'))
                    extracted_formats.extend(mpd_formats)
                else:
                    self.report_warning(f'{ff} formats are currently not supported')

            if not extracted_formats:
                self.report_warning(f'Unable to extract video format for {bid_tag}')

            resolution = parse_resolution(video_format.get('scrsz'))
            for f in extracted_formats:
                f.update({
                    'quality': get_quality(bid),
                    'format_note': bid_tag,
                    **resolution,
                })
            formats.extend(extracted_formats)

        self._sort_formats(formats)

        # Subtitle entries come from the initial response, so their base URL is
        # read from that same response rather than from the per-format loop
        # variable (which may be unbound or empty here).
        subtitle_base = initial_format_data.get('dstl', 'http://meta.video.iqiyi.com')
        for sub_format in traverse_obj(
                initial_format_data, ('program', 'stl', ...), expected_type=dict, default=[]):
            lang = self._LID_TAGS.get(str_or_none(sub_format.get('lid')), sub_format.get('_name'))
            subtitles.setdefault(lang, []).extend([{
                'ext': format_ext,
                'url': urljoin(subtitle_base, sub_format[format_key]),
            } for format_key, format_ext in [('srt', 'srt'), ('webvtt', 'vtt')] if sub_format.get(format_key)])

        return {
            'id': video_id,
            'title': video_info['name'],
            'description': video_info.get('merge_desc'),
            'duration': parse_duration(video_info.get('len')),
            'age_limit': parse_age_limit(video_info.get('rating')),
            'average_rating': traverse_obj(page_data, ('playScoreInfo', 'score'), expected_type=float_or_none),
            'timestamp': parse_iso8601(video_info.get('isoUploadDate')),
            'categories': traverse_obj(page_data, ('album', 'videoAlbumInfo', 'videoTagMap', ..., ..., 'name'), expected_type=str_or_none),
            'cast': traverse_obj(page_data, ('album', 'videoAlbumInfo', 'actorArr', ..., 'name'), expected_type=str_or_none),
            'series': video_info.get('album_name'),
            'formats': formats,
            'subtitles': subtitles,
        }
class IqAlbumIE(InfoExtractor):
    """Playlist extractor for ``iq.com/album/...`` pages.

    Each episode is emitted as a URL result handled by ``IqIE``.
    """

    IE_NAME = 'iq.com:album'
    _VALID_URL = r'https?://(?:www\.)?iq\.com/album/(?:[\w-]*-)?(?P<id>\w+)'

    def _entries(self, album_id_num, page_ranges, album_id=None, mode_code='intl', lang_code='en_us'):
        """Yield one URL result per episode listed in each page range.

        album_id_num: identifier used in the episode-list API path.
        page_ranges:  iterable of dicts with 'from' and 'to' keys (and an
                      optional 'msg' used only in the progress note).
        album_id:     id shown in download messages.
        mode_code, lang_code: values for the API's 'modeCode' / 'langCode'
                      query parameters; falsy values fall back to the defaults.
        """
        # Callers may pass None explicitly (e.g. the result of a failed
        # traverse_obj lookup), which would otherwise be sent in the query
        # instead of the signature defaults.
        mode_code = mode_code or 'intl'
        lang_code = lang_code or 'en_us'

        for page_range in page_ranges:
            page = self._download_json(
                f'https://pcw-api.iq.com/api/episodeListSource/{album_id_num}', album_id,
                note=f'Downloading video list episodes {page_range.get("msg", "")}',
                errnote='Unable to download video list', query={
                    'platformId': 3,
                    'modeCode': mode_code,
                    'langCode': lang_code,
                    'endOrder': page_range['to'],
                    'startOrder': page_range['from'],
                })
            for video in page['data']['epg']:
                yield self.url_result(
                    'https://www.iq.com/play/' + video['playLocSuffix'],
                    IqIE.ie_key(), video.get('qipuIdStr'), video.get('name'))

    def _real_extract(self, url):
        """Build a playlist from the album page's embedded Next.js data."""
        album_id = self._match_id(url)
        webpage = self._download_webpage(url, album_id)
        next_data = self._search_nextjs_data(webpage, album_id)
        album_data = next_data['props']['initialState']['album']['videoAlbumInfo']

        page_props = ('props', 'initialProps', 'pageProps')
        entries = self._entries(
            album_data['albumId'], album_data['totalPageRange'], album_id,
            traverse_obj(next_data, (*page_props, 'modeCode')),
            traverse_obj(next_data, (*page_props, 'langCode')))
        return self.playlist_result(entries, album_id, album_data.get('name'), album_data.get('desc'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment