Last active
June 26, 2022 23:53
-
-
Save twobob/e778af99c52ad5de463d5a18e1bb3759 to your computer and use it in GitHub Desktop.
A version that processes multiple VTT-formatted subtitles from YouTube and computes a rough mean words-per-minute (wpm) rate.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
import sys
import time

import requests
import youtube_dl
# Command-line arguments with the script name (sys.argv[0]) dropped.
argv = sys.argv[1:]
# NOTE(review): not referenced by any code visible in this file; confirm
# nothing else relies on it before removing.
Url = ""
def _pick_vtt_url(tracks, language):
    """Return the URL of the first 'vtt' entry for *language* in *tracks*, else None."""
    for fmt in (tracks or {}).get(language) or []:
        if fmt['ext'] == 'vtt':
            return fmt['url']
    return None


def down_sub(video_url, language):
    """Find the download link of the vtt subtitles for a video.

    Uploader-provided subtitles are preferred; automatic captions are used
    only when no vtt entry exists among the provided subtitles for *language*.

    Args:
        video_url: URL of the video. For YouTube links, everything from
            ``&list=`` onwards is dropped before the lookup.
        language: Key to look up in the subtitle dictionaries (e.g. ``'en'``).

    Returns:
        ``[subtitle_url, video_name]`` where ``video_name`` is the video title
        with characters reserved in Windows filenames removed, or ``None``
        when extraction fails, the video reports no formats, or no vtt
        subtitles exist for *language*. Failures are reported with ``print``.
    """
    # Remove the playlist part of a YouTube URL if present.
    lowered = video_url.lower()
    if ("youtube.com" in lowered or "youtu.be" in lowered) and '&list=' in video_url:
        video_url = video_url.split('&list=')[0].strip()

    ydl_opts = {'dump-json': True,
                'writesubtitles': True,
                'writeautomaticsub': True,
                'quiet': True,
                "no_warnings": True,
                "logtostderr": True,
                'youtube_include_dash_manifest': False}
    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(video_url, download=False)
    except Exception:
        # Exception (not a bare except) so Ctrl-C and SystemExit still propagate.
        print('Error : Check your Internet Connection or Url.')
        return None

    # .get() so that a missing 'formats' key is reported as an unavailable
    # video rather than raising KeyError.
    if not info_dict or not info_dict.get('formats'):
        print("Status : Something went wrong retry or video is unavailable")
        return None

    # Strip characters that are reserved in Windows filenames.
    video_name = re.sub("[\\\\/*?:\"<>|]", "", info_dict['title'])

    for tracks in (info_dict.get('subtitles'), info_dict.get('automatic_captions')):
        sub_dlink = _pick_vtt_url(tracks, language)
        if sub_dlink is not None:
            return [sub_dlink, video_name]
    return None
def _strip_cue_timings(text):
    """Remove cue timing ranges (two HH:MM:SS.mmm-shaped stamps around a 3-char arrow) from *text*."""
    return re.sub(
        r'\d{2}\W\d{2}\W\d{2}\W\d{3}\s\W{3}\s\d{2}\W\d{2}\W\d{2}\W\d{3}',
        '',
        text,
    )


def main():
    """Print a rough words-per-minute figure for a video's English captions.

    The URL comes from the command line when exactly one argument was given;
    otherwise the user is prompted for it. The video duration is read through
    youtube_dl, the vtt captions are located with ``down_sub`` and downloaded
    with requests, and the word count is divided by the duration in minutes.

    Captions whose text contains ``align:start`` are saved under
    ``subtitles/<video name>.vtt`` and read back; other captions ("pure subs")
    are processed in memory and echoed to stdout.

    This function never returns: it ends with ``sys.exit(2)`` after printing
    the result (status kept from the original script), or exits with an error
    message (status 1) when the duration or the captions are unavailable.
    """
    if len(argv) == 1:
        url = argv[0]
    else:
        url = input('Please input a video URL: ')

    ydl_opts = {
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitlesformat': 'vtt',
        'subtitleslangs': ['en'],
        'noplaylist': True
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=False)
    duration = info_dict.get('duration')
    if not duration:
        # A missing or zero duration would otherwise fail in the division below.
        sys.exit('Error : video duration is unavailable, cannot compute wpm.')

    subtitle = down_sub(url, 'en')
    if not subtitle:
        sys.exit('Error : no English vtt subtitles found for this video.')
    sub_url, video_name = subtitle

    response = requests.get(sub_url)
    # Fail loudly on HTTP errors instead of counting words of an error page.
    response.raise_for_status()
    raw_subs = response.content
    parsed_subs = raw_subs.decode("utf-8")

    if "align:start" not in parsed_subs:
        print('Pure Subs')
        flat = ' '.join(_strip_cue_timings(parsed_subs).split())
        flat = flat.replace('WEBVTT Kind: captions Language: en', '')
        print(flat)
        # split() without arguments so empty strings are not counted as words.
        totalcount = len(flat.split())
    else:
        os.makedirs('subtitles', exist_ok=True)
        vtt_path = os.path.join('subtitles', f"{video_name}.vtt")
        with open(vtt_path, 'wb') as f:
            f.write(raw_subs)
        print('\n Subtitle Downloaded Successfully.')
        # The bytes were decoded as UTF-8 above; read them back the same way
        # instead of relying on the platform default encoding.
        with open(vtt_path, encoding="utf-8") as f:
            content = f.readlines()
        if len(content) <= 6:
            sys.exit('Error : subtitle file is too short to contain caption text.')
        # NOTE(review): only the 7th line of the file is counted, not the
        # whole transcript; confirm this is intended.
        subs = content[6]
        totalcount = len(_strip_cue_timings(subs).split())

    convert = time.strftime("%H:%M:%S", time.gmtime(duration))
    print(totalcount, 'total words in HMS:', convert)
    wpm = int(totalcount / (duration / 60))
    print(wpm, 'ish wpm')
    # NOTE(review): status 2 on success is kept from the original script;
    # callers may depend on it.
    sys.exit(2)
# Run the command-line tool only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If the extraneous output bothers your workflow, you can edit
Lib\site-packages\youtube_dl\extractor\Common.py and comment out the self.to_screen calls by prepending a # to each;
likewise, comment out the self.to_screen calls in whatever downloader you are calling.