Created
December 17, 2024 18:41
-
-
Save maciekish/23dc508d00ed0700e7a1d51c70eefb21 to your computer and use it in GitHub Desktop.
Disable (rename to .disabled) duplicate external subtitles if embedded subtitles in the same language exist
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Disable external subtitle files that duplicate an embedded subtitle track.
#
# For every .mkv/.mp4 file found under the given path, the languages of the
# embedded subtitle streams are read with ffprobe + jq. Any subtitle file in
# the same directory named "<media base name>.*.(srt|ass|vtt)" whose last
# dot-separated component before the extension is a two-character code that
# matches an embedded language is renamed to "<name>.disabled". Nothing is
# deleted.
#
# Usage:    $0 /path/to/media
# Requires: bash 4+ (associative arrays, mapfile), find, ffprobe, jq.

# Check if argument provided
if [[ -z "${1:-}" ]]; then
  echo "Usage: $0 /path/to/media"
  exit 1
fi

BASE_DIR="$1"

if [[ ! -e "$BASE_DIR" ]]; then
  echo "Error: path does not exist: $BASE_DIR" >&2
  exit 1
fi

# Without these tools no embedded language can be detected; fail loudly
# instead of silently doing nothing.
for required_tool in ffprobe jq; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    echo "Error: required tool not found: $required_tool" >&2
    exit 1
  fi
done

# Mapping from 3-letter embedded codes to 2-letter external codes.
# Both ISO 639-2 variants (bibliographic and terminologic) are listed where
# they differ, because either may appear in a stream's language tag.
declare -rA EMBEDDED_TO_TWO_LETTER=(
  [eng]="en"
  [spa]="es"
  [ita]="it"
  [jpn]="ja"
  [fra]="fr" [fre]="fr"
  [ger]="de" [deu]="de"
  [por]="pt"
  [rus]="ru"
  [chi]="zh" [zho]="zh"
  [kor]="ko"
  [swe]="sv"
  [pol]="pl"
)

#######################################
# Disable the external subtitles of one media file that duplicate one of its
# embedded subtitle languages.
# Changes the working directory, so callers should invoke it in a subshell.
# Globals:   EMBEDDED_TO_TWO_LETTER (read)
# Arguments: $1 - path to a media file
# Outputs:   one "Disabling external subtitle: ..." line per renamed file
# Returns:   0 always; files that cannot be inspected are skipped
#######################################
process_media_file() {
  local media_file=$1
  local dir filename base_name json_info lang3 lang2 lang_code subfile
  local -a embedded_langs_3=()
  local -A embedded_map=()
  # Captures the two-character component right before the file extension,
  # e.g. "en" in "Movie.en.srt".
  local -r lang_re='\.([a-zA-Z0-9]{2})\.[^.]+$'

  dir="$(dirname "$media_file")"
  filename="$(basename "$media_file")"
  base_name="${filename%.*}"

  # Work inside the media file's directory so subtitle globs and the printed
  # names are relative to it.
  cd "$dir" || return 0

  # Extract JSON info about the file. The "./" prefix keeps a file name that
  # starts with "-" from being parsed as an ffprobe option.
  json_info=$(ffprobe -v quiet -print_format json -show_streams "./$filename") \
    || return 0

  # One language tag per embedded subtitle stream; untagged streams are
  # dropped by "// empty".
  mapfile -t embedded_langs_3 < <(
    jq -r '.streams[]? | select(.codec_type == "subtitle") | .tags.language // empty' \
      <<<"$json_info"
  )

  # Convert embedded languages from 3-letter to 2-letter; codes that are not
  # in the map are ignored.
  for lang3 in "${embedded_langs_3[@]}"; do
    lang3=${lang3,,}
    [[ -n "$lang3" ]] || continue
    lang2=${EMBEDDED_TO_TWO_LETTER[$lang3]:-}
    [[ -n "$lang2" ]] || continue
    embedded_map["$lang2"]=1
  done

  # Nothing embedded that we know how to match: leave external files alone.
  (( ${#embedded_map[@]} > 0 )) || return 0

  # Process all external subtitles (srt/ass/vtt) that match the base name
  for subfile in "$base_name".*.srt "$base_name".*.ass "$base_name".*.vtt; do
    # An unmatched glob stays as the literal pattern; skip it.
    [[ -e "$subfile" ]] || continue

    # Extract the two-letter language code from something like base_name.en.srt
    [[ "$subfile" =~ $lang_re ]] || continue
    lang_code=${BASH_REMATCH[1],,}

    # If this two-letter code matches an embedded language, disable it
    if [[ -n "${embedded_map[$lang_code]:-}" ]]; then
      echo "Disabling external subtitle: $subfile"
      mv -- "$subfile" "$subfile.disabled"
    fi
  done

  return 0
}

# Find all mkv or mp4 files recursively.
# `-print0` with `read -d ''` handles file names containing spaces/newlines.
while IFS= read -r -d '' media_file; do
  # Run in a subshell so the cd inside process_media_file does not leak into
  # later iterations; otherwise relative paths from find stop resolving after
  # the first file.
  (process_media_file "$media_file")
done < <(find "$BASE_DIR" -type f \( -iname "*.mkv" -o -iname "*.mp4" \) -print0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment