Created
September 26, 2020 15:03
-
-
Save benkant/9a47b0d5bc66ed7d9a742a3883271d67 to your computer and use it in GitHub Desktop.
Extract closed caption text from a YouTube video
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Don't call this directly; it is invoked by `subtitles.sh`.
"""Given a subtitle file (SRT or WebVTT), clean it up as much as possible to look like prose."""
import re | |
import sys | |
# Substrings that mark a line as subtitle metadata rather than spoken text:
# "-->" appears in cue timing lines, the rest are WebVTT header fields.
# Any input line containing one of these is dropped entirely.
bad_words = ['-->', 'WEBVTT', 'Language: en', 'Kind: captions']
# Matches one markup tag such as "<c>" or "<00:00:01.000>" (non-greedy, so
# each tag is removed separately). Compiled once instead of on every call.
_TAG_RE = re.compile(r'<.*?>')


def cleanline(raw_line):
    """Return *raw_line* with every ``<...>`` tag removed.

    Text outside the tags, including the trailing newline, is left untouched.
    """
    return _TAG_RE.sub('', raw_line)
def strip_and_dedupe(lines):
    """Strip '>' speaker markers from each line and drop repeated lines.

    The first occurrence of every line is kept and the original order is
    preserved, so the result still reads as running text.  (Collecting the
    lines into a ``set`` would remove duplicates too, but in arbitrary order.)
    """
    # str.strip treats its argument as a set of characters, so this removes
    # any run of '>' from both ends of the line.
    stripped = (line.strip(">>") for line in lines)
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(stripped))


def main(argv=None):
    """Clean the subtitle file named on the command line.

    Writes the tag-free, metadata-free lines to ``newfile.txt`` and the
    de-duplicated result to ``sub_<input name>.txt``, both in the current
    directory.  Returns a process exit status: 0 on success, 2 when no input
    file was given.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        print(f"usage: {sys.argv[0]} SUBTITLE_FILE", file=sys.stderr)
        return 2
    vid_id = args[0]

    with open(vid_id, encoding="utf-8") as oldfile:
        kept = [
            cleanline(line)
            for line in oldfile
            if not any(bad_word in line for bad_word in bad_words)
        ]

    # Intermediate output; still written so anything relying on it keeps
    # working, but no longer read back in.
    with open('newfile.txt', 'w', encoding="utf-8") as newfile:
        newfile.writelines(kept)

    with open(f'sub_{vid_id}.txt', 'w', encoding="utf-8") as rmdup:
        rmdup.writelines(strip_and_dedupe(kept))
    return 0


if __name__ == "__main__":
    sys.exit(main())
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Usage:
#   ./subtitles.sh tc4ROCJYbm0
#
# Downloads the English auto-generated subtitles for the given YouTube video
# ID into /tmp, runs `~/subtitle-clean.py` on them and prints the cleaned text.
# NOTE: Downloads the youtube-dl Python script if this is our first time.
# NOTE(security): youtube-dl is fetched into, and executed from, /tmp, which
# is typically writable by other local users.

# Stop at the first failing step instead of running later steps on missing files.
set -euo pipefail

if [ "$#" -lt 1 ] || [ -z "$1" ]
then
    echo "Usage: $0 VIDEO_ID" >&2
    exit 1
fi
VIDEO=$1

cd /tmp

if [ ! -f /tmp/youtube-dl ]
then
    echo "Downloading youtube-dl script..."
    # -f makes curl fail on HTTP errors rather than saving the error page.
    # Download under a temporary name so a failed or interrupted transfer is
    # never mistaken for the real script on the next run.
    curl -fL -o /tmp/youtube-dl.part https://github.com/ytdl-org/youtube-dl/releases/latest/download/youtube-dl
    mv /tmp/youtube-dl.part /tmp/youtube-dl
fi

if [ ! -x /tmp/youtube-dl ]
then
    chmod +x /tmp/youtube-dl
fi

echo "Downloading subtitles..."
# "--" ends option parsing, so a video ID that starts with "-" is not taken
# for a command-line flag.
./youtube-dl --skip-download --convert-subs srt --write-auto-sub --id --sub-lang en -- "$VIDEO"

if [ ! -x ~/subtitle-clean.py ]
then
    chmod +x ~/subtitle-clean.py
fi

~/subtitle-clean.py "${VIDEO}.en.vtt"
cat "sub_${VIDEO}.en.vtt.txt"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment