Skip to content

Instantly share code, notes, and snippets.

@maciekish
Created December 17, 2024 18:41
Show Gist options
  • Save maciekish/23dc508d00ed0700e7a1d51c70eefb21 to your computer and use it in GitHub Desktop.
Save maciekish/23dc508d00ed0700e7a1d51c70eefb21 to your computer and use it in GitHub Desktop.
Disable duplicate external subtitles (rename them to *.disabled) when the media file already has an embedded subtitle track in the same language
#!/usr/bin/env bash
# Validate the command line.
#   $1 - path to scan (a directory tree, or a single media file).
# Diagnostics go to stderr so stdout only ever carries the normal
# progress output of the script.
if [[ -z "${1:-}" ]]; then
  printf 'Usage: %s /path/to/media\n' "$0" >&2
  exit 1
fi
# Fail early with a clear message instead of letting the scan
# run against a path that does not exist.
if [[ ! -e "$1" ]]; then
  printf '%s: no such file or directory: %s\n' "$0" "$1" >&2
  exit 1
fi
# Root of the scan; read by the rest of the script.
BASE_DIR="$1"
# Mapping from the 3-letter language codes found on embedded subtitle
# streams to the 2-letter codes used in external subtitle file names
# (e.g. "Movie.en.srt").
#
# ISO 639-2 assigns two 3-letter codes to some languages: a
# bibliographic one and a terminologic one (fre/fra, ger/deu, chi/zho).
# A file may be tagged with either, so both spellings are listed for
# every language in this table that has such a pair.
# Codes that are not listed here are ignored by the lookup.
declare -A EMBEDDED_TO_TWO_LETTER=(
  [eng]="en"
  [spa]="es"
  [ita]="it"
  [jpn]="ja"
  [fra]="fr" [fre]="fr"
  [ger]="de" [deu]="de"
  [por]="pt"
  [rus]="ru"
  [chi]="zh" [zho]="zh"
  [kor]="ko"
  [swe]="sv"
  [pol]="pl"
)
# Preflight: ffprobe and jq are both required below. Their error output
# is deliberately silenced inside the loop, so without this check a
# missing tool would make the script do nothing without any message.
for tool in ffprobe jq; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    printf '%s: required command not found: %s\n' "$0" "$tool" >&2
    exit 1
  fi
done

# Matches "<anything>.<2 alphanumerics>.<extension>" at the end of a
# file name; capture group 1 is the 2-letter language code
# (e.g. "en" in "Movie.en.srt").
sub_lang_re='\.([a-zA-Z0-9]{2})\.[^.]+$'

# Walk every .mkv/.mp4 file under BASE_DIR. For each one, collect the
# languages of its embedded subtitle streams and rename any external
# subtitle (<base>.*.srt/.ass/.vtt next to it) in one of those languages
# to "<name>.disabled".
#
# Reads:  BASE_DIR, EMBEDDED_TO_TWO_LETTER
# Output: one "Disabling external subtitle: <name>" line per renamed file
#
# find -print0 / read -d '' keep file names with spaces or newlines intact.
while IFS= read -r -d '' media_file; do
  # Paths are built from $dir rather than by cd-ing into it, so a
  # relative BASE_DIR keeps resolving for every file of the scan.
  dir="$(dirname -- "$media_file")"
  filename="$(basename -- "$media_file")"
  base_name="${filename%.*}"

  # 2-letter codes of the subtitle languages embedded in this file;
  # re-created empty for every media file.
  declare -A embedded_map=()
  while read -r lang3; do
    [[ -n "$lang3" ]] || continue
    # Codes missing from the table (including jq's "null" for untagged
    # streams) map to nothing and are skipped.
    lang2="${EMBEDDED_TO_TWO_LETTER[$lang3]:-}"
    if [[ -n "$lang2" ]]; then
      embedded_map["$lang2"]=1
    fi
  done < <(
    # </dev/null: nothing in this pipeline may consume the outer loop's
    # stdin, which carries the NUL-delimited file list.
    # jq's stderr is dropped on purpose: a file ffprobe cannot parse
    # simply yields no languages.
    ffprobe -v quiet -print_format json -show_streams "$media_file" </dev/null \
      | jq -r '.streams[] | select(.codec_type == "subtitle") | .tags.language' 2>/dev/null
  )

  # Process all external subtitles (srt/ass/vtt) that match the base name.
  for subfile in "$dir/$base_name".*.srt "$dir/$base_name".*.ass "$dir/$base_name".*.vtt; do
    # An unmatched glob stays as the literal pattern; skip it.
    [[ -e "$subfile" ]] || continue

    sub_name="${subfile##*/}"
    # Names without a 2-character language segment right before the
    # extension (e.g. "Movie.en.forced.srt") are left alone.
    [[ "$sub_name" =~ $sub_lang_re ]] || continue
    lang_code="${BASH_REMATCH[1]}"

    # If this two-letter code matches an embedded language, disable it.
    if [[ -n "${embedded_map[$lang_code]:-}" ]]; then
      echo "Disabling external subtitle: $sub_name"
      mv -- "$subfile" "$subfile.disabled"
    fi
  done
done < <(find "$BASE_DIR" -type f \( -iname "*.mkv" -o -iname "*.mp4" \) -print0)

# Clean up
unset embedded_map
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment