Last active
June 23, 2022 00:43
-
-
Save twobob/94ae44e62476d4ee65759792913a3d4a to your computer and use it in GitHub Desktop.
Fast, rough estimate of mean words per minute (WPM) for a video, using its VTT subtitle file and the duration from the video's metadata.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import youtube_dl | |
import time | |
import re | |
import requests | |
import os | |
def _find_vtt_url(tracks, language):
    """Return the URL of the ``vtt`` format for ``language`` in ``tracks``, or None."""
    for fmt in (tracks or {}).get(language) or []:
        if fmt.get('ext') == 'vtt':
            return fmt['url']
    return None


def downSub(video_url, language):
    """Look up the WebVTT subtitle download URL for a video.

    The ``subtitles`` entry of the extracted metadata is searched first;
    ``automatic_captions`` is used as a fallback when it has no ``vtt``
    format for ``language``.

    Args:
        video_url: URL of the video. For YouTube links, anything from
            ``&list=`` onwards (the playlist ID) is dropped before lookup.
        language: Key into the subtitle dictionaries, e.g. ``'en'``.

    Returns:
        ``[subtitle_url, video_name]`` on success, where ``video_name`` is the
        video title with characters reserved in Windows file names removed.
        ``None`` when the metadata cannot be fetched, the video reports no
        formats, or no ``vtt`` track exists for ``language``.
    """
    # Check for a YouTube link and remove the playlist ID from the URL if present.
    lowered = video_url.lower()
    if 'youtube.com' in lowered or 'youtu.be' in lowered:
        if '&list=' in video_url:
            video_url = video_url.split('&list=')[0].strip()

    ydl_opts = {'dump-json': True,
                'writesubtitles': True,
                'writeautomaticsub': True,
                'youtube_include_dash_manifest': False}
    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(video_url, download=False)
    except Exception as err:
        # Catch Exception rather than everything so Ctrl-C still aborts the
        # script; include the cause so the hint is not the only information.
        print('Error : Check your Internet Connection or Url.', f'({err})')
        return None

    # Checked outside the try block so a problem here is never misreported
    # as a connection error.
    if not info_dict or not info_dict.get('formats'):
        print(' Status : Something went wrong retry or video is unavailable')
        return None

    # Strip characters that are reserved in Windows file names.
    video_name = re.sub(r'[\\/*?:"<>|]', '', info_dict['title'])

    # Order matters: the 'subtitles' tracks win over 'automatic_captions'.
    for tracks in (info_dict.get('subtitles'),
                   info_dict.get('automatic_captions')):
        sub_dlink = _find_vtt_url(tracks, language)
        if sub_dlink:
            return [sub_dlink, video_name]
    return None
def main():
    """Prompt for a video URL and print a rough words-per-minute estimate.

    Steps:
      1. Read a URL from stdin (a built-in default is used for empty input).
      2. Fetch the video's metadata to obtain its duration in seconds.
      3. Download the English VTT subtitle file into ``subtitles/``.
      4. Count the space-separated words on one sampled line of that file
         and divide by the duration in minutes.

    Everything is reported with ``print``; nothing is returned.
    """
    Url = input('Please input a video URL: ')
    start = time.time()
    if not Url:
        Url = "https://www.youtube.com/watch?v=8HZbaYc0qhc"
    else:
        print('using', Url)

    ydl_opts = {
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitlesformat': 'vtt',
        'subtitleslangs': ['en'],
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(Url, download=False)
    duration = (info_dict or {}).get('duration')
    print('The duration of the video is: ' + str(duration) + ' seconds')

    subtitle = downSub(Url, 'en')
    if subtitle:  # None when no VTT track could be found
        sub_url, video_name = subtitle
        # The target directory is not guaranteed to exist on a first run.
        os.makedirs('subtitles', exist_ok=True)
        vtt_path = os.path.join('subtitles', f"{video_name}.vtt")

        r = requests.get(sub_url, timeout=30)
        # Fail loudly instead of saving an HTTP error page as a .vtt file.
        r.raise_for_status()
        with open(vtt_path, 'wb') as f:
            f.write(r.content)
        print('\n Subtitle Downloaded Successfully.')

        # WebVTT files are UTF-8; do not rely on the platform default.
        with open(vtt_path, encoding='utf-8') as f:
            content = f.readlines()

        # NOTE(review): only the line at index 6 is counted, so the figure
        # below is that single line's word count over the whole duration.
        # Confirm whether every caption line was meant to be included.
        sample_index = 6
        if len(content) <= sample_index:
            print('Subtitle file is too short to sample any caption text.')
        else:
            subs = content[sample_index]
            # Drop any "HH:MM:SS.mmm --> HH:MM:SS.mmm" cue timing range.
            new = re.sub(r'\d{2}\W\d{2}\W\d{2}\W\d{3}\s\W{3}\s\d{2}\W\d{2}\W\d{2}\W\d{3}', '', subs)
            print(new)
            totalcount = len(new.split(' '))
            print(totalcount, 'total')
            if duration:
                wpm = totalcount / (duration / 60)
                print(wpm, 'wpm')
            else:
                # duration is None or 0: dividing would raise.
                print('Duration unavailable; cannot compute wpm.')
    else:
        print("No 'en' VTT subtitles found for this video.")

    elapsed = time.time() - start
    print(elapsed, 'seconds')
# Run the interactive estimate only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment