Last active
May 22, 2021 08:06
-
-
Save pansila/0868305f0dbf6ba817264e5bfea0e0aa to your computer and use it in GitHub Desktop.
Download the srt and video files from the official CNN website so that subs2srs can make Anki cards from them.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import argparse | |
import requests | |
import shutil | |
from bs4 import BeautifulSoup | |
from urllib.parse import urlparse | |
from vtt_to_srt import vtt_to_srt | |
# Directory that receives the downloaded playlist and subtitles and the merged video.
OUTPUT_DIR = 'output'
# Scratch directory holding the individual .ts segments until they are merged.
OUTPUT_TMP = os.path.join(OUTPUT_DIR, 'tmp')

# One call creates both levels. exist_ok=True makes reruns a no-op, but unlike
# a swallowed FileExistsError it still fails loudly when a regular file is in
# the way of one of the directories.
os.makedirs(OUTPUT_TMP, exist_ok=True)
def m3u8Downloader(m3u8):
    """Download every segment listed in a local playlist and merge them into one .mp4.

    Segments are stored in OUTPUT_TMP as ``<playlist name>_<n>.ts``. A segment
    that is already present there is not downloaded again, so an interrupted
    run can be resumed by calling the function again.

    Args:
        m3u8: Path of the playlist file on disk. Only lines ending in
            ``.ts?null=0`` are treated as segment URLs.

    Returns:
        0 when the merged video was written to OUTPUT_DIR, -1 when the playlist
        lists no segment or at least one segment could not be downloaded.
    """
    with open(m3u8, 'r') as f:
        tslist = [line.rstrip() for line in f if line.rstrip().endswith('.ts?null=0')]
    if not tslist:
        print('No ts file found.')
        return -1
    print('Total ' + str(len(tslist)) + ' files')

    baseName = os.path.splitext(os.path.basename(m3u8))[0]
    videoName = os.path.join(OUTPUT_DIR, baseName + '.mp4')

    tsFiles = []
    missing = 0
    for index, tsUrl in enumerate(tslist, start=1):
        videoNameTmp = baseName + '_' + str(index) + '.ts'
        tsFile = os.path.join(OUTPUT_TMP, videoNameTmp)
        if not os.path.exists(tsFile):
            res = requests.get(tsUrl, stream=True, headers={'Referer': 'https://cnnios-f.akamaihd.net'})
            if res.status_code != 200:
                print('\nConnection error for url {}: {}'.format(tsUrl, res.status_code))
                # Keep going so the remaining segments are cached for a retry,
                # but remember that the merge must not happen.
                missing += 1
                continue
            # Stream into a side file and rename afterwards: a download that
            # dies half-way must not leave a truncated file under the final
            # name, because the next run would take it for a complete segment.
            partFile = tsFile + '.part'
            with open(partFile, 'wb') as f:
                for chunk in res:
                    f.write(chunk)
            os.replace(partFile, tsFile)
            print(videoNameTmp + ' downloaded\r')
        tsFiles.append(tsFile)

    if missing:
        print('Merge failed, missing files.')
        return -1

    with open(videoName, 'wb') as f:
        for tsFile in tsFiles:
            with open(tsFile, 'rb') as mergefile:
                shutil.copyfileobj(mergefile, f)
    # Segments are deleted only after the whole video has been written, so a
    # failure during the merge does not force them to be downloaded again.
    for tsFile in tsFiles:
        os.remove(tsFile)
    print(videoName + ' merged.')
    return 0
def getM3u8(url):
    """Download the playlist at *url* into OUTPUT_DIR.

    The local file name is the last path segment of the URL. When the segment
    before it ends in ``.mp4.csmil``, the part of that segment before its
    first dot is used instead, with ``.m3u8`` appended.

    Args:
        url: URL of the m3u8 playlist.

    Returns:
        The path of the saved playlist, or None when no file name can be
        derived from the URL or the server does not answer with status 200.
    """
    segments = urlparse(url).path.split('/')
    m3u8Name = segments[-1]
    # A URL without a path has a single (empty) segment and no parent segment.
    name = segments[-2] if len(segments) >= 2 else ''
    if name.endswith('.mp4.csmil'):
        m3u8Name = name.split('.')[0] + '.m3u8'
    if not m3u8Name:
        # Nothing to name the file after (empty path or trailing slash).
        return None
    m3u8Path = os.path.join(OUTPUT_DIR, m3u8Name)

    # The playlist is fetched once and streamed straight to disk.
    res = requests.get(url, stream=True)
    if res.status_code != 200:
        return None
    with open(m3u8Path, 'wb') as f:
        for chunk in res:
            f.write(chunk)
    print(m3u8Name + ' downloaded')
    return m3u8Path
def format_srt(srt_file):
    """Rewrite an .srt file so that every cue holds one complete sentence.

    Cues are concatenated until the text ends with ``.``, ``?`` or ``!``; the
    resulting cue spans from the start of its first source cue to the end of
    its last one. Lines containing both ``[`` and ``]`` are dropped, ``{\\an8}``
    tags are removed and a period directly before a closing double quote is
    moved behind it so that the sentence end is detected.

    The result is written next to the input; its name is the input file name
    cut at the first dot plus ``.srt`` (``a.b_en.srt`` becomes ``a.srt``).

    Args:
        srt_file: Path of the UTF-8 encoded .srt file to read.
    """
    # Cut only the file name at its first dot; a dot in a directory name must
    # not truncate the path.
    directory, filename = os.path.split(srt_file)
    output_srt_path = os.path.join(directory, filename.split('.')[0] + '.srt')

    # First pass: reduce the file to alternating "time range" / "text" lines.
    LineList = []
    timestamp_line = False  # True while the last kept line is a time range
    with open(srt_file, 'r', encoding="utf-8") as srtfile:
        for line in srtfile:
            # Drops blank lines and cue numbers (and any other very short line).
            if len(line) <= 4:
                continue
            line = line.replace('{\\an8}', '').replace('."', '".')
            # remove audio only subtitles
            if '[' in line and ']' in line:
                continue
            if '-->' in line:
                # remove time range only subtitles
                if timestamp_line:
                    LineList.pop()
                timestamp_line = True
            elif timestamp_line:
                timestamp_line = False
            else:
                # merge lines that are broken up into multilines; text that
                # appears before the first time range has no cue and is dropped
                if LineList:
                    LineList[-1] += ' ' + line.strip()
                continue
            LineList.append(line.rstrip())

    # Second pass: glue cues together until a sentence is complete.
    sentences = []
    clauses = []
    time_range_start = None
    time_range_end = None
    for line in LineList:
        if '-->' in line:
            start, end = [part.strip() for part in line.split('-->', 1)]
            if time_range_start is None:
                time_range_start = start
            time_range_end = end
            continue
        clauses.append(line)
        if line.endswith(('.', '?', '!')):
            sentences.append((time_range_start, time_range_end, ' '.join(clauses)))
            clauses = []
            time_range_start = None
            time_range_end = None
    # Keep trailing text that never reached closing punctuation instead of
    # silently losing the end of the transcript.
    if clauses and time_range_start is not None:
        sentences.append((time_range_start, time_range_end, ' '.join(clauses)))

    results = []
    for lineNum, (start, end, text) in enumerate(sentences, start=1):
        results.extend([str(lineNum), start + ' --> ' + end, text, ''])

    with open(output_srt_path, 'w', encoding="utf-8") as srtfileout:
        srtfileout.write('\n'.join(results))
    print('Formatted srt file is saved to "{}"'.format(output_srt_path))
def m3u8url_to_vtturl(url):
    """Derive the caption (.vtt) URL that belongs to a playlist URL.

    Everything before the first comma of *url* is kept, the first four
    ``/``-separated pieces (scheme, empty piece, host and the first path
    segment) are discarded, and the remainder is re-rooted on
    ``pmd.cdn.turner.com`` with ``_en.vtt`` appended. For example

        https://cnnios-f.akamaihd.net/i/cnn/big/cnn10/2020/06/17/caption/ten-0619.cnn_3282394_ios_,440,650,840,1240,3000,5500,.mp4.csmil/index_1_av.m3u8?null=0

    becomes

        https://pmd.cdn.turner.com/cnn/big/cnn10/2020/06/17/caption/ten-0619.cnn_3282394_ios__en.vtt
    """
    before_bitrates = url.partition(',')[0]
    remote_path = '/'.join(before_bitrates.split('/')[4:])
    return 'https://pmd.cdn.turner.com/' + remote_path + '_en.vtt'
def getVTT(vttURL):
    """Download the caption file at *vttURL* into OUTPUT_DIR.

    The local file is named after the last ``/``-separated piece of the URL.

    Returns:
        The path of the saved file, or None when the server does not answer
        with status 200.
    """
    filename = vttURL.rsplit('/', 1)[-1]
    destination = os.path.join(OUTPUT_DIR, filename)

    response = requests.get(vttURL, stream=True)
    if response.status_code != 200:
        return None

    # Stream the body to disk piece by piece.
    with open(destination, 'wb') as out:
        for piece in response:
            out.write(piece)
    print(filename + ' downloaded')
    return destination
if __name__ == '__main__':
    # Usage: pass the playlist URL; subtitles and video end up in OUTPUT_DIR.
    parser = argparse.ArgumentParser()
    parser.add_argument('m3u8URL', help='the m3u8 URL')
    args = parser.parse_args()

    m3u8 = getM3u8(args.m3u8URL)
    if m3u8 is None:
        # Stop with a message instead of passing None on to the downloader.
        sys.exit('Failed to download the m3u8 playlist from {}'.format(args.m3u8URL))

    vttURL = m3u8url_to_vtturl(args.m3u8URL)
    vtt = getVTT(vttURL)
    if vtt is None:
        sys.exit('Failed to download the subtitles from {}'.format(vttURL))

    # NOTE(review): assumes vtt_to_srt writes the .srt next to the .vtt with
    # the same base name, which is what the path on the next line expects.
    vtt_to_srt(vtt)
    srt_file = os.path.splitext(vtt)[0] + '.srt'
    format_srt(srt_file)

    # Report a failed video download through the exit status.
    if m3u8Downloader(m3u8) != 0:
        sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Prerequisites:
python3
install the dependencies imported by the script (requests, bs4, vtt_to_srt)
Usage:
python cnn_10_maker.py https://cnnios-f.akamaihd.net/i/cnn/big/cnn10/2020/06/17/caption/ten-0619.cnn_3282394_ios_,440,650,840,1240,3000,5500,.mp4.csmil/index_1_av.m3u8?null=0