-
-
Save glasslion/b2fcad16bc8a9630dbd7a945ab5ebf5e to your computer and use it in GitHub Desktop.
""" | |
Convert YouTube subtitles (vtt) to human-readable text.

Download only subtitles from YouTube with youtube-dl:
    youtube-dl --skip-download --convert-subs vtt <video_url>

Note that the default subtitle format provided by YouTube is ass, which is hard
to process with simple regex. Luckily youtube-dl can convert ass to vtt, which
is easier to process.

To convert all vtt files inside a directory:
    find . -name "*.vtt" -exec python vtt2text.py {} \;
""" | |
import sys | |
import re | |
def remove_tags(text):
    """
    Strip vtt markup from ``text`` and shorten cue timing lines.

    Three things happen, in order:

    1. Inline tags are deleted: ``</c>``, ``<c>`` / ``<c.colorXXXX>`` and
       inline ``<HH:MM:SS.mmm>`` timestamps.
    2. Every cue timing (``HH:MM:SS.mmm --> ...``) is replaced by the
       ``HH:MM`` part of its start time. Everything from the seconds to the
       end of the line is dropped, so the cue settings after the end time
       (``align:start position:0%`` or anything else) do not matter.
    3. Whitespace-only lines are emptied.

    Returns the cleaned text as a single string.
    """
    inline_tags = (
        r'</c>',
        r'<c(\.color\w+)?>',
        r'<\d{2}:\d{2}:\d{2}\.\d{3}>',
    )
    for pattern in inline_tags:
        text = re.sub(pattern, '', text)

    # Keep only HH:MM of the start time. The rest of the line is matched
    # with ``.*`` (which stops at the newline) instead of one specific
    # cue-settings string, so timing lines with other settings, or none at
    # all, are shortened too rather than leaking into the output.
    text = re.sub(
        r'(\d{2}:\d{2}):\d{2}\.\d{3} --> .*',
        r'\g<1>',
        text,
    )

    text = re.sub(r'^\s+$', '', text, flags=re.MULTILINE)
    return text
def remove_header(lines):
    """
    Remove vtt file header.

    The header is considered to end at the later of two marks, each located
    by its first occurrence in ``lines``:

    * a line that is exactly ``##``;
    * a ``Language: <code>`` line (any language code, not only ``en``).

    Returns the lines after that mark. When neither mark is present the
    lines are returned unchanged (as a copy).
    """
    def first_index(is_mark):
        # Index of the first line satisfying ``is_mark``, or -1 if none does.
        return next(
            (index for index, line in enumerate(lines) if is_mark(line)),
            -1,
        )

    style_end = first_index(lambda line: line == '##')
    language = first_index(
        lambda line: re.fullmatch(r'Language: [\w-]+', line) is not None
    )

    # Cut after whichever mark comes last. Letting the language line win
    # unconditionally would leave any lines between it and ``##`` in the
    # output.
    return lines[max(style_end, language) + 1:]
def merge_duplicates(lines):
    """
    Remove duplicated subtitles. Duplicates are always adjacent.

    Iterates over ``lines`` and yields each one unless it is:

    * an empty string;
    * an ``HH:MM`` timestamp equal to the previously yielded timestamp;
    * a caption equal to the previously yielded caption.

    Timestamps and captions are tracked separately, so a repeated caption is
    still dropped when a timestamp line sits between the two copies.
    """
    # Raw string: ``\d`` in a plain literal is an invalid escape sequence.
    timestamp_re = re.compile(r'^\d{2}:\d{2}$')
    last_timestamp = ''
    last_cap = ''
    for line in lines:
        if line == '':
            continue
        if timestamp_re.match(line):
            if line != last_timestamp:
                yield line
                last_timestamp = line
        elif line != last_cap:
            yield line
            last_cap = line
def merge_short_lines(lines):
    """
    Join consecutive short caption lines into longer output lines.

    Captions are accumulated, separated by single spaces, while the
    accumulated text plus the next caption stays under 80 characters;
    otherwise the accumulated text is yielded and a new line is started.

    An empty line or an ``HH:MM`` timestamp is yielded on its own, prefixed
    with a newline character. Any caption text accumulated so far is yielded
    first, so captions never end up after the timestamp that follows them.

    Yielded caption lines are stripped of surrounding whitespace, and no
    empty caption line is ever yielded.
    """
    # Raw string: ``\d`` in a plain literal is an invalid escape sequence.
    timestamp_re = re.compile(r'^\d{2}:\d{2}$')
    buffer = ''
    for line in lines:
        if line == '' or timestamp_re.match(line):
            # Flush pending text before the separator to keep the order.
            if buffer:
                yield buffer.strip()
                buffer = ''
            yield '\n' + line
            continue

        if not buffer:
            buffer = line
        elif len(line + buffer) < 80:
            buffer += ' ' + line
        else:
            yield buffer.strip()
            buffer = line

    if buffer:
        yield buffer.strip()
def main():
    """
    Convert the vtt file named on the command line to a text file.

    Reads ``sys.argv[1]``, runs it through ``remove_tags``,
    ``remove_header``, ``merge_duplicates`` and ``merge_short_lines``, and
    writes the result, one item per line, to a sibling file whose ``.vtt``
    suffix is replaced by ``.txt``. If the input name has no ``.vtt``
    suffix, ``.txt`` is appended instead, so the input file is never
    overwritten.

    Exits with a usage message when no file name is given.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: {} <subtitles.vtt>'.format(sys.argv[0]))

    vtt_file_name = sys.argv[1]
    suffix = '.vtt'
    if vtt_file_name.lower().endswith(suffix):
        txt_name = vtt_file_name[:-len(suffix)] + '.txt'
    else:
        # Without this branch the output name would equal the input name
        # and the source file would be truncated.
        txt_name = vtt_file_name + '.txt'

    # WebVTT files are UTF-8; "utf-8-sig" also swallows an optional BOM.
    with open(vtt_file_name, encoding='utf-8-sig') as f:
        text = f.read()

    text = remove_tags(text)
    lines = remove_header(text.splitlines())
    lines = merge_short_lines(merge_duplicates(lines))

    with open(txt_name, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in lines)
# Run the conversion only when this file is executed as a script, so that
# importing it for its helper functions has no side effects.
if __name__ == "__main__":
    main()
Would a command-line tool with the interface below be welcome?
yt-text bZ6pA--F3D4 > subtitles.txt
or better with full URL?
yt-text https://youtu.be/bZ6pA--F3D4 > subtitles.txt
Would a command-line tool with the interface below be welcome?
yt-text bZ6pA--F3D4 > subtitles.txt
or better with full URL?
yt-text https://youtu.be/bZ6pA--F3D4 > subtitles.txt
Yes, it would be 😁
EDIT: For anyone interested, https://gist.github.com/epogrebnyak/ba87ba52f779f7ebd93b04b2af1059aa
Hi everyone, wrapped this script here: https://github.com/epogrebnyak/justsubs
Sample usage:
from justsubs import Video
subs = Video("KzWS7gJX5Z8").subtitles(language="en-uYU-mmqFLq8")
subs.download()
print(subs.get_text_blocks()[:10])
print(subs.get_plain_text()[:550])
It seems that simply "en" does not work; you need "en-uYU-mmqFLq8".
Also, `pip install justsubs` should work.
For YouTube subtitles, there were some timestamps and metadata remaining while using the script.
I've fixed it here:
https://gist.github.com/florentroques/c08bbe54fba42ec56c9d48229ed9c49b
I just found this script after I made this one:
https://gist.github.com/arturmartins/1c78de3e8c21ffce81a17dc2f2181de4
Might be of help to some.