Skip to content

Instantly share code, notes, and snippets.

@twobob
Last active June 26, 2022 23:53
Show Gist options
  • Save twobob/e778af99c52ad5de463d5a18e1bb3759 to your computer and use it in GitHub Desktop.
Save twobob/e778af99c52ad5de463d5a18e1bb3759 to your computer and use it in GitHub Desktop.
A version that processes multiple VTT-formatted subtitles from YouTube and computes a rough mean words-per-minute (wpm) figure
import youtube_dl
import time
import re
import requests
import os
import sys
# Command-line arguments with the script name (sys.argv[0]) dropped;
# main() inspects this list to decide whether a URL was supplied.
argv = sys.argv[1:]
# NOTE(review): `Url` is not referenced anywhere else in the visible code —
# it looks unused; confirm before removing.
Url = ""
def down_sub(video_url, language):
    """Look up the download URL of a video's VTT subtitle track.

    Manually uploaded subtitles are preferred; automatic captions are only
    consulted when no uploaded VTT track exists for ``language``.

    Args:
        video_url: URL of the video. For YouTube links, anything from
            ``&list=`` onwards (the playlist part) is stripped first.
        language: Language code used as the key into the subtitle tables
            returned by youtube_dl (e.g. ``'en'``).

    Returns:
        ``[subtitle_url, sanitized_video_title]`` when a VTT track is found,
        otherwise ``None`` (metadata lookup failed, the video has no
        formats, or there is no VTT track for ``language``).
    """
    # Drop the playlist suffix so only the single video is inspected.
    lowered = video_url.lower()
    is_youtube = "youtube.com" in lowered or "youtu.be" in lowered
    if is_youtube and '&list=' in video_url:
        video_url = video_url.split('&list=')[0].strip()

    ydl_opts = {
        # NOTE(review): 'dump-json' is the command-line spelling; the Python
        # API presumably ignores this key — confirm before relying on it.
        'dump-json': True,
        'writesubtitles': True,
        'writeautomaticsub': True,
        'quiet': True,
        "no_warnings": True,
        "logtostderr": True,
        'youtube_include_dash_manifest': False,
    }

    # Only the network/extraction call is guarded; `Exception` (rather than a
    # bare except) lets Ctrl-C and SystemExit propagate.
    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(video_url, download=False)
    except Exception:
        print('Error : Check your Internet Connection or Url.')
        return None

    # A missing/empty result or format list means there is nothing usable.
    if not info_dict or not info_dict.get('formats'):
        print("Status : Something went wrong retry or video is unavailable")
        return None

    # Strip characters that are reserved in Windows file names.
    video_name = re.sub(r'[\\/*?:"<>|]', "", info_dict['title'])

    # Uploaded subtitles first, automatic captions as the fallback.
    track_tables = (
        info_dict.get('subtitles'),
        info_dict.get('automatic_captions'),
    )
    for table in track_tables:
        for fmt in (table or {}).get(language) or []:
            if fmt.get('ext') == 'vtt':
                return [fmt['url'], video_name]
    return None
def main():
    """Fetch a video's English VTT subtitles and print a rough words-per-minute.

    The video URL is the single command-line argument; when zero or more
    than one argument was given the user is prompted for it instead.

    Side effects:
        * Queries video metadata through youtube_dl and downloads the
          subtitle track with requests.
        * For tracks containing ``align:start`` cue settings, saves the raw
          download to ``subtitles/<title>.vtt`` (the directory is created if
          needed).
        * Prints the word count, the duration as HH:MM:SS and the wpm value.
        * Always terminates the process with ``sys.exit(2)``.
    """
    # Exactly one argument is taken as the URL; anything else falls back to
    # an interactive prompt.
    url = argv[0] if len(argv) == 1 else input('Please input a video URL: ')

    ydl_opts = {
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitlesformat': 'vtt',
        'subtitleslangs': ['en'],
        'noplaylist': True,
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=False)
    duration = (info_dict or {}).get('duration')

    subtitle = down_sub(url, 'en')
    if not subtitle:
        print('Status : No English VTT subtitles available.')
        sys.exit(2)
    sub_url, video_name = subtitle

    try:
        response = requests.get(sub_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print('Error : Could not download the subtitles:', err)
        sys.exit(2)
    raw_subs = response.content
    parsed_subs = raw_subs.decode("utf-8")

    # Matches a cue timing such as "00:00:01.000 --> 00:00:03.500".
    cue_timing = re.compile(
        r'\d{2}\W\d{2}\W\d{2}\W\d{3}\s\W{3}\s\d{2}\W\d{2}\W\d{2}\W\d{3}'
    )

    if "align:start" not in parsed_subs:
        # No per-cue alignment settings: count words over the whole
        # document, minus timings and the WEBVTT header.
        print('Pure Subs')
        flat = ' '.join(cue_timing.sub('', parsed_subs).split())
        flat = flat.replace('WEBVTT Kind: captions Language: en', '')
        print(flat)
        # split() without an argument ignores the empty token left behind
        # by removing the header.
        totalcount = len(flat.split())
    else:
        os.makedirs('subtitles', exist_ok=True)
        vtt_path = os.path.join('subtitles', f"{video_name}.vtt")
        with open(vtt_path, 'wb') as f:
            f.write(raw_subs)
        print('\n Subtitle Downloaded Successfully.')
        with open(vtt_path, encoding='utf-8') as f:
            content = f.readlines()
        # NOTE(review): only the line at index 6 of the saved file is
        # counted, as before — confirm this matches the caption layout.
        subs = content[6] if len(content) > 6 else ''
        totalcount = len(cue_timing.sub('', subs).split())

    if not duration:
        # Without a positive duration gmtime()/the division below would be
        # meaningless (gmtime(None) silently uses the current time).
        print(totalcount, 'total words; duration unavailable, cannot compute wpm')
        sys.exit(2)

    convert = time.strftime("%H:%M:%S", time.gmtime(duration))
    print(totalcount, 'total words in HMS:', convert)
    wpm = int(totalcount / (duration / 60))
    print(wpm, 'ish wpm')
    # NOTE(review): exit status 2 is kept for backward compatibility even
    # though it is also used on success — confirm whether 0 is intended.
    sys.exit(2)
if __name__ == '__main__':
    # Run the command-line workflow only when executed as a script,
    # not when this module is imported.
    main()
@twobob
Copy link
Author

twobob commented Jun 26, 2022

If the extraneous output bothers your workflow, you can edit

# Snippet quoted from youtube_dl's extractor code (see the surrounding note);
# this is the method whose status line the note suggests silencing.
def report_download_webpage(self, video_id):
    """Report webpage download."""
    # Emits "<video_id>: Downloading webpage" through self.to_screen.
    self.to_screen('%s: Downloading webpage' % video_id)

in Lib\site-packages\youtube_dl\extractor\common.py by prepending a # to the self.to_screen line,
and likewise comment out the self.to_screen calls, with #'s, in whichever downloader you are calling.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment