Last active
July 24, 2025 23:08
-
-
Save kelvinauta/0561842fc9a7e138cd166c42fdd5f4bc to your computer and use it in GitHub Desktop.
Download the video -> Transcribe it using OpenAI Whisper-1 -> Translate the subtitle lines in parallel -> Create an mkv with the subtitles
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Download a YouTube video, transcribe its audio with OpenAI Whisper,
# translate every subtitle cue and mux the translated subtitles into an .mkv.
#
# Requires the OPENAI_API_KEY environment variable.

# Abort on errors, on unset variables and on failures inside pipelines.
set -euo pipefail

# Target language name; it is interpolated into the translation prompt.
OUTPUT_LANG="español"
# Number of translation jobs launched before waiting for the batch to finish.
CHUNK_SIZE=50
# Chat-completions model used to translate each subtitle cue.
MODEL_TRANSLATE="gpt-4.1-mini"
# Print the command-line help text to stdout.
# Globals:
#   OUTPUT_LANG, MODEL_TRANSLATE (read; shown in the description)
# Arguments:
#   none
function usage() {
  # Unquoted delimiter on purpose: $(basename "$0") and the two globals are
  # expanded inside the help text. The terminator must sit alone on its line.
  cat <<EOF
Usage: $(basename "$0") <youtube_url> [temp_dir]
Downloads a YouTube video, extracts its audio, obtains subtitles with OpenAI Whisper, translates them into "${OUTPUT_LANG}" using the model "${MODEL_TRANSLATE}", and finally muxes the translated subtitles back into the video producing an .mkv file.
Positional arguments:
youtube_url URL of the YouTube video you want to process (mandatory)
temp_dir Optional directory to place intermediate files. When omitted a random directory will be created under /tmp.
Options:
-h, --help Show this help message and exit.
Environment:
OPENAI_API_KEY Your OpenAI secret key. It must be exported before running this script.
Dependencies: yt-dlp, ffmpeg, jq, curl, sed, mktemp, tr
EOF
}
# ---- Argument and environment validation ----------------------------------

if [[ $# -eq 0 ]]; then
  echo "No arguments provided, use --help" >&2
  exit 1
fi

# Help is handled before the API-key check so that --help works even when
# OPENAI_API_KEY has not been exported yet.
if [[ "$1" == "-h" || "$1" == "--help" ]]; then
  usage
  exit 0
fi

# ${VAR:-} keeps `set -u` from aborting with "unbound variable" before the
# friendly message can be printed.
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
  echo "Env required: OPENAI_API_KEY. use --help" >&2
  exit 1
fi

yt_url="$1"
shift

# Working directory for intermediate files: the optional second argument, or
# a fresh private directory. `mktemp -d` creates the directory atomically
# instead of only printing a guessable name (as `mktemp -u` does); it is only
# invoked when no directory was supplied.
dir_tmp="${1:-$(mktemp -d "/tmp/video.XXXXXX")}"

# Container extension used for the downloaded video.
format="mp4"
# Create the working directory and derive the video file path from the
# sanitized video title.
# Globals:
#   dir_tmp, yt_url, format (read)
#   title, tmp_video_path   (written)
# Outputs:
#   progress messages on stdout
function setup() {
  echo "init setup"
  mkdir -p "$dir_tmp"

  # Build a file-system friendly slug: lowercase, spaces turned into dashes,
  # everything except [A-Za-z0-9-] removed. Dashes are squeezed last so that
  # characters deleted between two dashes do not leave a double dash behind.
  title="$(
    yt-dlp --print title "$yt_url" \
      | tr '[:upper:]' '[:lower:]' \
      | tr -d '\n' \
      | tr ' ' '-' \
      | tr -cd 'A-Za-z0-9-' \
      | tr -s '-'
  )"

  # Drop a leading/trailing dash and fall back to a fixed name: a title made
  # only of non-ASCII characters would otherwise produce ".mp4" or "-.mp4".
  title="${title#-}"
  title="${title%-}"
  title="${title:-video}"

  tmp_video_path="${dir_tmp}/${title}.${format}"
  echo "done setup: ${tmp_video_path}"
}
# Download the best video and audio streams and merge them into one file.
# Globals:
#   format, tmp_video_path, yt_url (read)
# Outputs:
#   progress messages on stdout
function download() {
  echo "init download"
  yt-dlp -f bestvideo+bestaudio --merge-output-format "$format" \
    -o "$tmp_video_path" "$yt_url"
  # The closing message used to say "init download video"; report completion
  # like every other step does.
  echo "done download: ${tmp_video_path}"
}
# Path of the compressed audio track that is sent for transcription.
tmp_audio_path="${dir_tmp}/audio.ogg"

# Extract a small mono Opus track from the downloaded video.
# Globals:
#   tmp_video_path, tmp_audio_path (read)
# Outputs:
#   progress messages on stdout
function extract_audio() {
  echo "init extract audio"
  # -y / -nostdin: never stop on the "overwrite?" prompt when the script is
  #   re-run with an existing temp_dir (with a quiet log level the prompt
  #   would not even be visible).
  # -loglevel error: stay silent on success but show why ffmpeg failed.
  # -vn -ac 1 -b:a 12k -application voip: audio only, mono, low bitrate
  #   speech-oriented Opus encoding to keep the file small.
  ffmpeg -nostdin -y -loglevel error -i "$tmp_video_path" \
    -vn -map_metadata -1 -ac 1 -c:a libopus -b:a 12k -application voip \
    "$tmp_audio_path"
  echo "done extract audio ${tmp_audio_path}"
}
# Path of the SRT file; written by transcribe and later edited in place.
tmp_subtitle_path="${dir_tmp}/subtitle.srt"

# Send the audio track to the OpenAI transcription endpoint and store the
# SRT response.
# Globals:
#   OPENAI_API_KEY, tmp_audio_path, tmp_subtitle_path (read)
# Outputs:
#   progress messages on stdout, diagnostics on stderr
# Returns:
#   non-zero when the request fails or the response is empty
function transcribe() {
  echo "init transcribe"
  # -f makes curl exit non-zero on HTTP errors instead of saving the error
  # body as if it were subtitles.
  curl -sSf -o "$tmp_subtitle_path" https://api.openai.com/v1/audio/transcriptions \
    -H "Authorization: Bearer $OPENAI_API_KEY" \
    -H "Content-Type: multipart/form-data" \
    -F file="@${tmp_audio_path}" \
    -F model="whisper-1" \
    -F response_format="srt"

  # An empty file would only surface later as an obscure muxing failure.
  if [[ ! -s "$tmp_subtitle_path" ]]; then
    echo "transcribe: empty subtitle file ${tmp_subtitle_path}" >&2
    return 1
  fi
  echo "done transcribe ${tmp_subtitle_path}"
}
# Translate one subtitle cue and write the translation back into the SRT file.
# Globals:
#   OUTPUT_LANG, MODEL_TRANSLATE, OPENAI_API_KEY (read)
# Arguments:
#   $1 - the cue: index line, timestamp line, then one or more text lines
#   $2 - path of the SRT file to edit in place
#   $3 - zero-based line offset of the cue inside that file
# Outputs:
#   the source text and its translation (or a failure note) on stdout
# Returns:
#   always 0; a failed translation is reported and the cue is left as it was
#
# NOTE: only the first text line of the cue (offset + 3) is replaced. The
# text of a multi-line cue is joined and translated as one line, and the
# remaining original text lines stay in the file untouched.
function translate_line() {
  local block="$1"
  local file="$2"
  local line=$(( $3 + 3 ))
  local text body response translated escaped lock
  local tries=0

  # Drop the index and timestamp lines and join the remaining text.
  text="$(sed '1,2d' <<<"$block" | tr '\n' ' ')"
  text="${text% }"

  body="$(jq -n \
    --arg text "$text" \
    --arg lang "$OUTPUT_LANG" \
    --arg model "$MODEL_TRANSLATE" \
    '{
      model: $model,
      messages: [
        { role: "developer",
          content: "Translate the text below into \($lang), your answer must be only and exclusively the translation, respecting the original content in a single line"
        },
        { role: "user", content: $text }
      ]
    }')"

  # -r yields the raw string rather than a JSON-quoted one, and `// empty`
  # turns a missing/null answer (e.g. an API error object) into "" instead of
  # the literal word "null". Declaration and assignment are separate so a
  # failing pipeline is not masked by `local`.
  response="$(curl -sS https://api.openai.com/v1/chat/completions \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $OPENAI_API_KEY" \
    -d "$body" | jq -r '.choices[0].message.content // empty')" || response=""

  # Force a single line and strip trailing whitespace.
  translated="${response//$'\n'/ }"
  translated="${translated%"${translated##*[![:space:]]}"}"

  printf '\n\n%s\n' "$text"
  if [[ -z "$translated" ]]; then
    echo "line ${line} fail in translate"
    return 0
  fi
  printf '%s\n' "$translated"

  # Escape the characters that are special in the replacement part of s///
  # so that arbitrary model output cannot break or alter the sed command.
  escaped="$(printf '%s' "$translated" | sed 's|[\\/&]|\\&|g')"

  # This function is run as many parallel jobs against the same file, and
  # `sed -i` rewrites the whole file, so concurrent edits would overwrite
  # each other. mkdir is atomic and serves as a simple lock.
  lock="${file}.lock"
  until mkdir "$lock" 2>/dev/null; do
    if (( ++tries >= 600 )); then
      echo "line ${line} fail in translate (could not lock ${lock})"
      return 0
    fi
    sleep 0.1
  done
  sed -i "${line}s/.*/${escaped}/" "$file" \
    || echo "line ${line} fail in translate"
  rmdir "$lock" || true
  return 0
}
# Translate every cue of the SRT file, running up to CHUNK_SIZE
# translate_line jobs in parallel per batch.
# Globals:
#   CHUNK_SIZE, tmp_subtitle_path (read)
# Outputs:
#   progress messages on stdout (plus whatever translate_line prints)
function translate_all() {
  echo "init translate velocity chunks ${CHUNK_SIZE}"
  local block text offset
  local index_line=0
  local running=0
  local -a pids=()

  # The file is split into NUL-terminated cues at every blank-line separator.
  # `|| [[ -n $block ]]` also processes a final cue that is not followed by a
  # blank line (read returns non-zero for that unterminated tail).
  while IFS= read -r -d '' block || [[ -n "$block" ]]; do
    # Ignore a whitespace-only tail.
    [[ "$block" =~ [^[:space:]] ]] || continue

    # Remove the trailing empty line; command substitution strips the rest.
    text="$(sed '${/^\s*$/d;}' <<<"$block")"
    # Lines taken by this cue in the file, plus the one blank separator line.
    offset=$(( $(wc -l <<<"$text") + 1 ))

    translate_line "$text" "$tmp_subtitle_path" "$index_line" &
    pids+=("$!")
    running=$(( running + 1 ))

    if (( running >= CHUNK_SIZE )); then
      # translate_line reports its own failures; a failed job must not abort
      # the remaining cues, hence the deliberate `|| true`.
      wait "${pids[@]}" || true
      pids=()
      # Reset the counter, otherwise every later job would be waited for
      # individually and the run would become serial after the first batch.
      running=0
    fi
    index_line=$(( index_line + offset ))
  done < <(
    sed -z 's/\n[[:space:]]*\n/\n\x00/g' "$tmp_subtitle_path"
  )

  # Guarded: expanding an empty array trips `set -u` on older bash versions.
  if (( ${#pids[@]} > 0 )); then
    wait "${pids[@]}" || true
  fi
  echo "done translate"
}
# Mux the downloaded video and the subtitle file into a Matroska container.
# Globals:
#   dir_tmp, title, tmp_video_path, tmp_subtitle_path (read)
#   tmp_output_path                                   (written)
# Outputs:
#   progress messages on stdout
function make_mkv() {
  echo "init make mkv"
  tmp_output_path="${dir_tmp}/${title}.mkv"
  # -y / -nostdin: do not block on the "overwrite?" prompt when the output
  #   already exists from a previous run in the same directory.
  # -loglevel error: silent on success, but failures are explained.
  # Video and audio are stream-copied; the subtitle stream is tagged with the
  # fixed language code "spa".
  ffmpeg -nostdin -y -loglevel error \
    -i "$tmp_video_path" -sub_charenc UTF-8 -i "$tmp_subtitle_path" \
    -c:v copy -c:a copy -c:s srt \
    -metadata:s:s:0 language=spa \
    "$tmp_output_path"
  echo "done make mkv, outputfile: ${tmp_output_path}"
}
# Run the pipeline. Order matters: each step consumes the file produced by
# the previous one (video -> audio -> subtitles -> translated subtitles -> mkv).
setup
download
extract_audio
transcribe
translate_all
make_mkv
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment