Skip to content

Instantly share code, notes, and snippets.

@MinePlayersPE
Last active January 24, 2022 10:51
Show Gist options
  • Save MinePlayersPE/589edb2e424f272bf82e3eaa4beb6130 to your computer and use it in GitHub Desktop.
Save MinePlayersPE/589edb2e424f272bf82e3eaa4beb6130 to your computer and use it in GitHub Desktop.
iq.com yt-dlp extractor plugin (OUTDATED, SEE IqIE IN https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/extractor/iqiyi.py INSTEAD)
# coding: utf-8
from __future__ import unicode_literals
import base64
import json
import random
# ⚠ Don't use relative imports
from yt_dlp.extractor.common import InfoExtractor
from yt_dlp.extractor.openload import PhantomJSwrapper
from yt_dlp.utils import (
ExtractorError,
float_or_none,
join_nonempty,
js_to_json,
parse_age_limit,
parse_duration,
parse_iso8601,
parse_resolution,
qualities,
str_or_none,
traverse_obj,
urljoin
)
# ℹ️ Instructions on making extractors can be found at:
# 🔗 https://github.com/ytdl-org/youtube-dl#adding-support-for-a-new-site
class IqIE(InfoExtractor):
    """Extractor for single videos on iq.com (the international version of iQiyi).

    The site signs its ``/dash`` API requests with an obfuscated ``cmd5x``
    function that lives in one of the page's webpack chunks.  This extractor
    locates that chunk, pulls the function out of it and runs it under
    PhantomJS to produce signed request paths for every known format id.
    """
    IE_NAME = 'iq.com'
    IE_DESC = 'International version of iQiyi'
    _VALID_URL = r'https?://(?:www\.)?iq\.com/play/(?:[\w-]*-)?(?P<id>\w+)'

    # Fallback mapping of format ids ("bid") to display labels.  It is replaced
    # on the instance by _update_bid_tags() when the player JS exposes a table.
    _BID_TAGS = {
        '100': '240P',
        '200': '360P',
        '300': '480P',
        '500': '720P',
        '600': '1080P',
        '610': '1080P50',
        '700': '2K',
        '800': '4K',
    }

    # Subtitle language ids ("lid") mapped to the language keys used in the
    # returned subtitles dict.
    _LID_TAGS = {
        '1': 'zh_CN',
        '2': 'zh_TW',
        '3': 'en',
        '18': 'th',
        '21': 'my',
        '23': 'vi',
        '24': 'id',
        '26': 'es',
        '28': 'ar',
    }

    def _extract_vms_player_js(self, webpage, video_id):
        """Return the source of the webpack chunk that contains the player code.

        The result is cached under ('iq', 'player_js'); a cached copy is
        returned without any network access.  Otherwise the webpack runtime
        referenced by ``webpage`` is downloaded, its two chunk maps are parsed
        and the chunks are fetched one by one (last key first) until one
        containing the marker text 'vms request' is found.

        Raises ExtractorError if no chunk contains the marker.
        """
        player_js_cache = self._downloader.cache.load('iq', 'player_js')
        if player_js_cache:
            return player_js_cache

        webpack_js_url = self._proto_relative_url(self._search_regex(
            r'<script src="([\w\./]+/static/chunks/webpack-\w+\.js)"', webpage, 'webpack URL'))
        webpack_js = self._download_webpage(
            webpack_js_url, video_id, note='Downloading webpack JS', errnote='Unable to download webpack JS')

        # Two object literals are captured: the first is looked up with a
        # fallback to the chunk index itself, the second supplies the part of
        # the file name that follows the dot.
        webpack_map1, webpack_map2 = [
            self._parse_json(js_map, video_id, transform_source=js_to_json)
            for js_map in self._search_regex(
                r'\(({[^}]*})\[\w+\][^\)]*\)\s*\+\s*["\']\.["\']\s*\+\s*({[^}]*})\[\w+\]\+["\']\.js',
                webpack_js, 'JS locations', group=(1, 2))]

        # Materialise the keys first: dict views only support reversed() on
        # Python 3.8+, while a list works everywhere.
        for module_index in reversed(list(webpack_map2)):
            module_js = self._download_webpage(
                f'https://stc.iqiyipic.com/_next/static/chunks/{webpack_map1.get(module_index, module_index)}.{webpack_map2[module_index]}.js',
                video_id, note=f'Downloading #{module_index} module JS',
                errnote='Unable to download module JS', fatal=False) or ''
            if 'vms request' in module_js:
                self._downloader.cache.store('iq', 'player_js', module_js)
                return module_js
        raise ExtractorError('Unable to extract player JS')

    def _extract_cmd5x_function(self, webpage, video_id):
        """Return the JS source of the signature ("cmd5x") function from the player JS."""
        return self._search_regex(
            r',\s*(function\s*\([^\)]*\)\s*{\s*var _qda.+_qdc\(\)\s*})\s*,',
            self._extract_vms_player_js(webpage, video_id), 'signature function')

    def _update_bid_tags(self, webpage, video_id):
        """Replace ``self._BID_TAGS`` with the table found in the player JS, if any.

        When the table cannot be located or parsed the existing tags are kept.
        """
        extracted_bid_tags = self._parse_json(
            self._search_regex(
                r'arguments\[1\][^,]*,\s*function\(\w,\w,\w\)\s*{\s*"use strict";\s*var \w=({.+}})\s*,\s*\w\s*=\s*{getNewVd',
                self._extract_vms_player_js(webpage, video_id), 'video tags', default=''),
            video_id, transform_source=js_to_json, fatal=False)
        if not extracted_bid_tags:
            return
        # Assigned on the instance, so the class-level fallback stays untouched.
        self._BID_TAGS = {bid: tag_info['value'] for bid, tag_info in extracted_bid_tags.items()}

    def _extract_formats_for_bid(self, bid, dash_path, video_id, get_quality):
        """Download the format data for one ``bid`` and return its format dicts.

        ``dash_path`` is the signed request path for this bid and
        ``get_quality`` is the ranking callable built from the known bids.
        An empty list is returned (after a warning) when nothing usable is
        found, including when the download itself fails.
        """
        # '0' is requested as a probe and has no label; fall back to the raw id.
        bid_tag = self._BID_TAGS.get(bid, bid)

        # The download is non-fatal, so normalise a failed/odd response to {}
        # instead of letting None reach the lookups below.
        format_data = traverse_obj(self._download_json(
            urljoin('https://cache-video.iq.com', dash_path), video_id,
            note=f'Downloading format data for {bid_tag}', errnote='Unable to download format data',
            fatal=False), 'data', expected_type=dict) or {}
        video_format = next((
            candidate for candidate in traverse_obj(
                format_data, ('program', 'video', ...), expected_type=dict) or []
            if str(candidate.get('bid')) == bid), {})

        extracted_formats = []
        if video_format.get('m3u8Url'):
            extracted_formats = self._extract_m3u8_formats(
                urljoin(format_data.get('dm3u8', 'https://cache-m.iq.com/dc/dt/'), video_format['m3u8Url']),
                'mp4', m3u8_id=bid)
        if video_format.get('mpdUrl'):
            # TODO: Properly extract mpd hostname
            extracted_formats = self._extract_mpd_formats(
                urljoin(format_data.get('dm', 'http://meta.video.iqiyi.com'), video_format['mpdUrl']), mpd_id=bid)
        if video_format.get('m3u8'):
            # The manifest is embedded inline; 'ff' selects how it is parsed.
            inline_manifest = video_format['m3u8']
            encoded_manifest = base64.b64encode(inline_manifest.encode('utf-8')).decode('ascii')
            ff = video_format.get('ff', 'ts')
            if ff == 'ts':
                extracted_formats, _ = self._parse_m3u8_formats_and_subtitles(
                    inline_manifest, 'data:application/x-mpegurl;base64,' + encoded_manifest,
                    'mp4', m3u8_id=bid)
            elif ff == 'm4s':
                extracted_formats, _ = self._parse_mpd_formats_and_subtitles(
                    inline_manifest, bid, format_data.get('dm3u8', 'https://cache-m.iq.com/dc/dt/'),
                    'data:application/dash+xml;base64,' + encoded_manifest)
            else:
                self.report_warning(f'{ff} formats are currently not supported')

        if not extracted_formats:
            self.report_warning(f'Unable to extract video format for {bid_tag}')
        for f in extracted_formats:
            f.update({
                'quality': get_quality(bid),
                'format_note': bid_tag,
                **parse_resolution(video_format.get('scrsz'))
            })
        return extracted_formats

    def _collect_subtitles(self, initial_format_data):
        """Build the subtitles dict from the 'stl' entries of the initial format data."""
        subtitles = {}
        # Read the base URL from the same response that lists the subtitles,
        # not from whichever per-format response happened to be fetched last.
        subtitle_base = initial_format_data.get('dstl', 'http://meta.video.iqiyi.com')
        for sub_format in traverse_obj(initial_format_data, ('program', 'stl', ...), expected_type=dict) or []:
            lang = self._LID_TAGS.get(str_or_none(sub_format.get('lid')), sub_format.get('_name'))
            subtitles.setdefault(lang, []).extend([{
                'ext': format_ext,
                'url': urljoin(subtitle_base, sub_format[format_key])
            } for format_key, format_ext in [('srt', 'srt'), ('webvtt', 'vtt')] if sub_format.get(format_key)])
        return subtitles

    def _real_extract(self, url):
        """Extract metadata, formats and subtitles for the video at ``url``."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        self._update_bid_tags(webpage, video_id)

        page_data = self._search_nextjs_data(webpage, video_id)['props']['initialState']['play']
        video_info = page_data['curVideoInfo']
        tvid, vid = video_info['tvId'], video_info['vid']

        # Use one fully-qualified URL for the cookie lookup; a scheme-less
        # host name is not a valid URL for the cookie jar.
        dfp_cookie = self._get_cookies('https://iq.com').get('__dfp')
        dfp = dfp_cookie.value if dfp_cookie else ''

        js_bid_list = '[' + ','.join(['0', *self._BID_TAGS.keys()]) + ']'
        cmd5x_func = self._extract_cmd5x_function(webpage, video_id)

        # bid 0 as an initial format checker
        # NOTE(review): tvid, vid and dfp are interpolated into the script
        # unescaped; they come from the page data and the cookie jar.
        dash_paths = self._parse_json(PhantomJSwrapper(self).get(
            url, html='<!DOCTYPE html>', video_id=video_id, note2='Executing signature code', jscode="""
console.log(page.evaluate(function() {
var tvid = "%s"; var vid = "%s"; var dfp = "%s"; var bid_list = %s; var tm = new Date().getTime();
var cmd5x_func = %s; var cmd5x_exporter = {}; cmd5x_func({}, cmd5x_exporter, {}); var cmd5x = cmd5x_exporter.cmd5x;
var authKey = cmd5x(cmd5x('') + tm + '' + tvid);
var k_uid = Array.apply(null, Array(32)).map(function() {return Math.floor(Math.random() * 15).toString(16)}).join('');
var dash_paths = {};
bid_list.forEach(function(bid) {
var query = {
'tvid': tvid,
'bid': bid,
'ds': 1,
'vid': vid,
'src': '01010031010018000000',
'vt': 0,
'rs': 1,
'uid': 0,
'ori': 'pcw',
'ps': 1,
'k_uid': k_uid,
'pt': 0,
'd': 0,
's': '',
'lid': '',
'slid': 0,
'cf': '',
'ct': '',
'authKey': authKey,
'k_tag': 1,
'ost': 0,
'ppt': 0,
'dfp': dfp,
'prio': JSON.stringify({
'ff': 'f4v',
'code': 2
}),
'k_err_retries': 0,
'up': '',
'su': 2,
'applang': 'en_us',
'sver': 2,
'X-USER-MODE': 'id',
'qd_v': 2,
'tm': tm,
'qdy': 'a',
'qds': 0,
'k_ft1': 141287244169348,
'k_ft4': 34359746564,
'k_ft5': 1,
'bop': JSON.stringify({
'version': '10.0',
'dfp': dfp
}),
'ut': 0,
};
var enc_params = [];
for (var prop in query) {
enc_params.push(encodeURIComponent(prop) + '=' + encodeURIComponent(query[prop]));
}
var dash_path = '/dash?' + enc_params.join('&'); dash_path += '&vf=' + cmd5x(dash_path);
dash_paths[bid] = dash_path;
});
return JSON.stringify(dash_paths);
}));
saveAndExit();
""" % (tvid, vid, dfp, js_bid_list, cmd5x_func))[1].strip(), video_id)

        initial_format_data = self._download_json(
            urljoin('https://cache-video.iq.com', dash_paths['0']), video_id,
            note='Downloading initial video format info',
            errnote='Unable to download initial video format info')['data']

        preview_time = traverse_obj(
            initial_format_data, ('boss_ts', 'data', 'previewTime'), expected_type=float_or_none)
        if preview_time:
            self.report_warning(f'This preview video is limited to {preview_time} seconds')

        # Build the ranking callable once; it only depends on the known bids.
        get_quality = qualities(list(self._BID_TAGS))
        available_bids = traverse_obj(
            initial_format_data, ('program', 'video', ..., 'bid'), expected_type=str_or_none) or []

        formats = []
        for bid in set(available_bids):
            dash_path = dash_paths.get(bid)
            if not dash_path:
                self.report_warning(f'Unknown format id: {bid}. It is currently not being extracted')
                continue
            formats.extend(self._extract_formats_for_bid(bid, dash_path, video_id, get_quality))
        self._sort_formats(formats)

        subtitles = self._collect_subtitles(initial_format_data)

        return {
            'id': video_id,
            'title': video_info['name'],
            'description': video_info.get('merge_desc'),
            'duration': parse_duration(video_info.get('len')),
            'age_limit': parse_age_limit(video_info.get('rating')),
            'average_rating': traverse_obj(page_data, ('playScoreInfo', 'score'), expected_type=float_or_none),
            'timestamp': parse_iso8601(video_info.get('isoUploadDate')),
            'categories': traverse_obj(
                page_data, ('album', 'videoAlbumInfo', 'videoTagMap', ..., ..., 'name'), expected_type=str_or_none),
            'cast': traverse_obj(
                page_data, ('album', 'videoAlbumInfo', 'actorArr', ..., 'name'), expected_type=str_or_none),
            'series': video_info.get('album_name'),
            'formats': formats,
            'subtitles': subtitles,
        }
class IqAlbumIE(InfoExtractor):
    """Extractor for iq.com album pages; yields one playlist entry per episode."""
    IE_NAME = 'iq.com:album'
    _VALID_URL = r'https?://(?:www\.)?iq\.com/album/(?:[\w-]*-)?(?P<id>\w+)'

    def _entries(self, album_id_num, page_ranges, album_id=None, mode_code='intl', lang_code='en_us'):
        """Yield a url_result for every episode listed in ``page_ranges``.

        ``album_id_num`` is the id used in the episode list API URL and
        ``album_id`` is only passed to the downloader for its messages.  Each
        page range must provide 'from' and 'to'; an optional 'msg' is shown in
        the progress note.  A ``mode_code`` or ``lang_code`` of None falls
        back to 'intl' / 'en_us' respectively.
        """
        # Callers may pass None explicitly when the page lacks these values;
        # use the documented defaults rather than sending None in the query.
        mode_code = mode_code or 'intl'
        lang_code = lang_code or 'en_us'
        for page_range in page_ranges:
            page = self._download_json(
                f'https://pcw-api.iq.com/api/episodeListSource/{album_id_num}', album_id,
                note=f'Downloading video list episodes {page_range.get("msg", "")}',
                errnote='Unable to download video list', query={
                    'platformId': 3,
                    'modeCode': mode_code,
                    'langCode': lang_code,
                    'endOrder': page_range['to'],
                    'startOrder': page_range['from']
                })
            for video in page['data']['epg']:
                yield self.url_result(
                    'https://www.iq.com/play/' + video['playLocSuffix'],
                    IqIE.ie_key(), video.get('qipuIdStr'), video.get('name'))

    def _real_extract(self, url):
        """Return a playlist of all episodes of the album at ``url``."""
        album_id = self._match_id(url)
        webpage = self._download_webpage(url, album_id)
        next_data = self._search_nextjs_data(webpage, album_id)
        album_data = next_data['props']['initialState']['album']['videoAlbumInfo']
        page_props = ('props', 'initialProps', 'pageProps')
        return self.playlist_result(
            self._entries(
                album_data['albumId'], album_data['totalPageRange'], album_id,
                traverse_obj(next_data, (*page_props, 'modeCode')),
                traverse_obj(next_data, (*page_props, 'langCode'))),
            album_id, album_data.get('name'), album_data.get('desc'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment