Last active
June 26, 2024 17:34
-
-
Save oozoofrog/02aedba77bb9403731bd28be6ccdb39f to your computer and use it in GitHub Desktop.
WWDC 2024 script: downloads a session video and its subtitles, translates the subtitles, and merges them into the video
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import subprocess
import sys

import openai
import requests
import yt_dlp
def get_best_format(m3u8_url):
    """Return the ``format_id`` of the best format yt-dlp reports for a URL.

    Formats are ranked by ``height``, then ``fps``, then ``tbr``; the
    highest-ranked one wins.

    Args:
        m3u8_url: URL passed to ``yt_dlp.YoutubeDL.extract_info``.

    Returns:
        The ``format_id`` value of the best-ranked format.

    Raises:
        ValueError: If no formats are reported for the URL.
    """
    ydl_opts = {
        'listformats': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        result = ydl.extract_info(m3u8_url, download=False)
    formats = result.get('formats') or []
    if not formats:
        raise ValueError(f"No formats found for: {m3u8_url}")

    def rank(fmt):
        # A format dict can carry a key whose value is None (presumably
        # audio-only renditions have no height/fps). `.get(key, 0)` would
        # return that None and make max() fail comparing None with int,
        # so fall back to 0 for any falsy value.
        return (
            fmt.get('height') or 0,
            fmt.get('fps') or 0,
            fmt.get('tbr') or 0,
        )

    return max(formats, key=rank)['format_id']
def get_webvtt_format(m3u8_url):
    """Return the ``format_id`` of the first format yt-dlp reports for a URL.

    Args:
        m3u8_url: URL passed to ``yt_dlp.YoutubeDL.extract_info``.

    Returns:
        The ``format_id`` of the first entry in the reported ``formats`` list.

    Raises:
        IndexError: If the reported format list is empty.
    """
    options = {'listformats': True}
    with yt_dlp.YoutubeDL(options) as downloader:
        info = downloader.extract_info(m3u8_url, download=False)
        available = info.get('formats', [])
    first_format = available[0]
    return first_format['format_id']
def fetch_informations(wwdc_url):
    """Fetch a WWDC session page and derive the video and subtitle URLs.

    The page HTML is scanned with plain string splitting: the title comes
    from the ``<title>`` element and the video URL is the quoted string that
    ends in ``hd.mp4``. The subtitle playlist URL is derived from the video
    URL by swapping the ``downloads/...`` tail for
    ``subtitles/eng/prog_index.m3u8``.

    Args:
        wwdc_url: URL of the WWDC session page.

    Returns:
        A tuple ``(content_id, content_uuid, title, video_url, subtitle_url)``.

    Raises:
        RuntimeError: If the page does not return HTTP 200 or contains no
            ``hd.mp4`` link.
    """
    # Bound the request so an unresponsive server cannot hang the script.
    response = requests.get(wwdc_url, timeout=30)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to fetch WWDC page: {response.status_code}")
    content = response.text

    # The content id is the last path segment; tolerate a trailing slash,
    # which would otherwise yield an empty id.
    content_id = wwdc_url.rstrip('/').split('/')[-1]

    title = ""
    if '<title>' in content:
        title = content.split('<title>')[1].split('</title>')[0].strip()

    # Test for the exact marker that is split on. Checking only for '.mp4'
    # would let a page without an HD link through and build a bogus URL
    # from the tail of the whole document.
    marker = 'hd.mp4'
    if marker not in content:
        raise RuntimeError("Could not find video in the page content")
    video_url = content.split(marker)[0].split('"')[-1] + marker

    # subtitle url sample https://devstreaming-cdn.apple.com/videos/wwdc/2024/10118/4/16FC914B-F442-41A4-AFF4-5047A3FF7125/subtitles/eng/prog_index.m3u8
    # video url sample https://devstreaming-cdn.apple.com/videos/wwdc/2024/10171/4/7E1A626A-DE4F-4DEB-A2D9-ECCAAD10A34F/downloads/wwdc2024-10171_hd.mp4
    url_parts = video_url.split('/')
    # In the sample layout the UUID is the third segment from the end.
    content_uuid = url_parts[-3] if len(url_parts) >= 3 else ""
    subtitle_url = video_url.split('downloads')[0] + 'subtitles/eng/prog_index.m3u8'

    return content_id, content_uuid, title, video_url, subtitle_url
def download_video(video_url, output_filename):
    """Download a video with yt-dlp and return the path of the saved file.

    Args:
        video_url: URL of the video to download.
        output_filename: Output path without extension; yt-dlp appends
            ``.%(ext)s`` through the ``outtmpl`` template.

    Returns:
        The ``filepath`` of the first entry in yt-dlp's
        ``requested_downloads`` list.
    """
    options = {
        'format': 'best',
        'outtmpl': f'{output_filename}.%(ext)s',
        'overwrites': True,
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        info = downloader.extract_info(video_url, download=True)
        saved_path = info['requested_downloads'][0]['filepath']
    print(f"Downloaded video to: {saved_path}")
    return saved_path
def download_subtitle(m3u8_url, output_filename):
    """Download a subtitle playlist with ffmpeg and save it as an SRT file.

    The file is written to ``<cwd>/<output_filename>_en.srt``; an existing
    file is overwritten.

    Args:
        m3u8_url: URL of the subtitle ``.m3u8`` playlist.
        output_filename: Output file name without the ``_en.srt`` suffix.

    Returns:
        Absolute path of the written SRT file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    output_path = os.path.join(os.getcwd(), output_filename + "_en.srt")
    command = [
        'ffmpeg',
        # Global options belong before the inputs; placed after the output
        # file ffmpeg reports it as a trailing option.
        '-y',
        '-i', m3u8_url,
        # Hand ffmpeg the same absolute path that is returned to the caller.
        output_path,
    ]
    subprocess.run(command, check=True)
    print(f"Downloaded subtitle to: {output_path}")
    return output_path
def download_video_and_subtitle(opts, video_url, subtitle_url, output_filename):
    """Download the video, the subtitle, or both, depending on ``opts``.

    Args:
        opts: ``"--video"`` downloads only the video, ``"--subtitle"`` only
            the subtitle; any other value (including ``None``) downloads both.
        video_url: URL passed to ``download_video``.
        subtitle_url: URL passed to ``download_subtitle``.
        output_filename: Base output name passed to both downloaders.

    Returns:
        A tuple ``(video_path, subtitle_path)``; the entry for a download
        that was skipped is ``None``.
    """
    want_video = opts != "--subtitle"
    want_subtitle = opts != "--video"

    # The video is always fetched before the subtitle when both are wanted.
    video_path = download_video(video_url, output_filename) if want_video else None
    subtitle_path = download_subtitle(subtitle_url, output_filename) if want_subtitle else None
    return video_path, subtitle_path
def merge_video_subtitle(video_file, subtitle_file, subtitle_ko_file):
    """Mux subtitle tracks into a copy of the video using ffmpeg.

    The result is written next to the input as ``<video stem>_subtitled.mp4``
    and overwrites an existing file. Video and audio are stream-copied;
    subtitles are encoded as ``mov_text``.

    Args:
        video_file: Path of the source video.
        subtitle_file: Path of the English subtitle file (tagged ``eng``).
        subtitle_ko_file: Path of the Korean subtitle file (tagged ``kor``);
            a falsy value (``None`` or ``""``) skips the Korean track.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # splitext removes only the final extension; splitting on the first '.'
    # would truncate any path that contains another dot.
    output_file = os.path.splitext(video_file)[0] + '_subtitled.mp4'

    subtitle_tracks = [(subtitle_file, 'eng')]
    if subtitle_ko_file:
        subtitle_tracks.append((subtitle_ko_file, 'kor'))

    # '-y' is a global option, so it goes first; it is applied in both the
    # one- and two-subtitle case so reruns never stall on an overwrite prompt.
    command = ['ffmpeg', '-y', '-i', video_file]
    for path, _language in subtitle_tracks:
        command += ['-i', path]

    # Without explicit -map options ffmpeg selects at most one subtitle
    # stream across all inputs, which would drop the second track.
    command += ['-map', '0:v', '-map', '0:a?']
    for input_index in range(1, len(subtitle_tracks) + 1):
        command += ['-map', str(input_index)]

    command += [
        '-c:v', 'copy',
        '-c:a', 'copy',
        '-c:s', 'mov_text',  # Use mov_text codec for subtitles
    ]
    for stream_index, (_path, language) in enumerate(subtitle_tracks):
        command += [f'-metadata:s:s:{stream_index}', f'language={language}']
    command.append(output_file)

    subprocess.run(command, check=True)
    print(f"Subtitled video saved to: {output_file}")
def translation_subtitle_to_ko(openai_key, subtitle_path):
    """Translate an English SRT file to Korean through the OpenAI API.

    Cues are sent in batches of 10 and the cleaned responses are written to a
    sibling file whose name has ``_en.srt`` replaced by ``_ko.srt``.

    Args:
        openai_key: OpenAI API key; when falsy the translation is skipped.
        subtitle_path: Path of the English ``*_en.srt`` file.

    Returns:
        Path of the translated SRT file, or ``""`` when no key was given.
    """
    if not openai_key:
        print("OpenAI key is not valid. ignore translation.")
        return ""

    # Imported only after the key check so the skip path above does not
    # depend on the OpenAI SDK.
    from openai import OpenAI

    def clean(content):
        # Strip markdown fences. The longer marker must be removed first:
        # removing '```' first leaves a stray 'srt' behind.
        content = (content or "").replace('```srt', '').replace('```', '')
        stripped_lines = (line.strip() for line in content.split('\n'))
        return '\n'.join(stripped_lines).strip()

    translated_subtitle_path = subtitle_path.replace('_en.srt', '_ko.srt')
    with open(subtitle_path, encoding='utf-8') as file:
        structured_subtitles = parse_srt(file.read())

    client = OpenAI(api_key=openai_key)
    total = len(structured_subtitles)
    batch_size = 10
    translated_chunks = []
    # Slicing by batch_size covers the final partial batch as well, so every
    # cue is sent for translation exactly once.
    for start in range(0, total, batch_size):
        batch = structured_subtitles[start:start + batch_size]
        content = translation_subtitle_to_openai(client, batch)
        translated_chunks.append(clean(content) + "\n\n")
        print(f"Translated {start + len(batch)}/{total} subtitles")

    with open(translated_subtitle_path, 'w', encoding='utf-8') as file:
        file.write(''.join(translated_chunks))
    return translated_subtitle_path
def translation_subtitle_to_openai(client, subtitles):
    """Ask the chat model to translate a batch of subtitle cues to Korean.

    Args:
        client: Object exposing ``chat.completions.create`` (an OpenAI client).
        subtitles: Iterable of dicts with ``index``, ``start_time``,
            ``end_time`` and ``text`` keys.

    Returns:
        The message content of the first choice in the completion.
    """
    # NOTE(review): the prompt text follows the source as visible here; any
    # indentation that originally lived inside the literals is not recoverable.
    instruction = (
        '\nTranslate the following subtitles to Korean. Please write the technical terms or names in both Korean and English. Please ensure that unnecessary characters, such as "`" or strings like `srt`, are not added outside of the subtitle format.\n'
    )
    cue_text = ''.join(
        f'\n{cue["index"]}\n{cue["start_time"]} --> {cue["end_time"]}\n{cue["text"]}\n'
        for cue in subtitles
    )
    conversation = [
        {"role": "system", "content": "You are a helpful assistant that translates text from English to Korean."},
        {"role": "user", "content": instruction + cue_text},
    ]
    completion = client.chat.completions.create(
        messages=conversation,
        model="gpt-4o",
    )
    return completion.choices[0].message.content
def parse_srt(srt):
    """Parse SRT text into a list of cue dictionaries.

    Cues are separated by one or more blank lines. ``\\n``, ``\\r\\n`` and
    ``\\r`` line endings are accepted, and a leading byte-order mark is
    ignored.

    Args:
        srt: Full contents of an SRT file.

    Returns:
        A list of dicts with keys ``index`` (int), ``start_time``,
        ``end_time`` and ``text`` (remaining lines joined with ``\\n``).
        Empty or whitespace-only input yields an empty list.

    Raises:
        ValueError: If a cue lacks a timing line or has a non-integer index.
    """
    normalized = srt.lstrip('\ufeff').strip()
    normalized = normalized.replace('\r\n', '\n').replace('\r', '\n')

    structured_data = []
    block = []
    # The trailing '' acts as a sentinel that flushes the final cue.
    for line in normalized.split('\n') + ['']:
        if line.strip():
            block.append(line)
            continue
        if not block:
            # Consecutive blank lines between cues (or empty input).
            continue
        if len(block) < 2 or ' --> ' not in block[1]:
            raise ValueError(f"Malformed SRT entry: {block!r}")
        start_time, _, end_time = block[1].partition(' --> ')
        structured_data.append({
            'index': int(block[0]),
            'start_time': start_time.strip(),
            'end_time': end_time.strip(),
            'text': '\n'.join(block[2:]),
        })
        block = []
    return structured_data
# opts may be None, in which case both the video and the subtitle are downloaded
def main(wwdc_url, opts, openai_key):
    """Run the whole pipeline for one WWDC session page.

    Looks up the media URLs, downloads what ``opts`` selects, translates the
    subtitle when one was downloaded, and merges the subtitles into the video
    when both files exist. Any exception is reported on stdout as
    ``Error: ...`` rather than propagated.

    Args:
        wwdc_url: URL of the WWDC session page.
        opts: ``"--video"``, ``"--subtitle"`` or ``None``.
        openai_key: OpenAI API key handed to the translation step.
    """
    try:
        page_info = fetch_informations(wwdc_url)
        content_id, content_uuid, title, remote_video, remote_subtitle = page_info
        print(f"Content ID: {content_id}")
        print(f"Content UUID: {content_uuid}")
        print(f"Title: {title}")
        print(f"Video URL: {remote_video}")
        print(f"Subtitle URL: {remote_subtitle}")

        # Output base name: "<content id>_<lower-cased title with underscores>"
        slug = title.lower().replace(" ", "_")
        filename = f'{content_id}_{slug}'

        video_path, subtitle_path = download_video_and_subtitle(
            opts, remote_video, remote_subtitle, filename
        )
        if not subtitle_path:
            # Without a subtitle there is nothing to translate or merge.
            return

        # translation with openai
        korean_subtitle_path = translation_subtitle_to_ko(openai_key, subtitle_path)
        if video_path:
            merge_video_subtitle(video_path, subtitle_path, korean_subtitle_path)
    except Exception as e:
        print(f"Error: {e}")
if __name__ == "__main__":
    # Command-line interface:
    #   wwdc_url              WWDC session page to process
    #   --video / --subtitle  download only the video or only the subtitle
    #   --openai_key=KEY      OpenAI API key used for the translation
    import argparse

    cli = argparse.ArgumentParser(description='Download WWDC video and subtitle')
    cli.add_argument('wwdc_url', type=str, help='WWDC video URL')
    cli.add_argument('--video', action='store_true', help='Download video only')
    cli.add_argument('--subtitle', action='store_true', help='Download subtitle only')
    cli.add_argument('--openai_key', type=str, help='OpenAI API key')
    parsed = cli.parse_args()

    # --video takes precedence when both flags are given; no flag means both.
    mode = "--video" if parsed.video else ("--subtitle" if parsed.subtitle else None)
    main(parsed.wwdc_url, mode, parsed.openai_key)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment