Created
February 27, 2023 04:11
-
-
Save dauuricus/c48b4e2c56db1ecd46b5f45e39592c8f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!pip install youtube_transcript_api | |
# -*- coding: utf-8 -*- | |
from youtube_transcript_api import YouTubeTranscriptApi | |
from urllib.parse import urlparse, parse_qs | |
urltext = "https://youtu.be/jzTjCgWlFRU" | |
args = [urltext] | |
def extract_video_id(url):
    """Return the YouTube video id contained in *url*, or None.

    Recognised forms:
      * https://youtu.be/<id>
      * https://(www.|m.|music.)youtube.com/watch?v=<id>
      * https://(www.|m.|music.)youtube.com/embed/<id>
      * https://(www.|m.|music.)youtube.com/v/<id>
      * https://(www.|m.|music.)youtube.com/shorts/<id>
      * https://(www.|m.|music.)youtube.com/live/<id>

    Args:
        url: The URL string to inspect.

    Returns:
        The video id as a string, or None when the URL is not a recognised
        YouTube URL or carries no video id.
    """
    query = urlparse(url)
    host = query.hostname

    if host == 'youtu.be':
        # Short links carry the id as the whole path, minus the leading '/'.
        return query.path[1:]

    if host in {'www.youtube.com', 'youtube.com',
                'm.youtube.com', 'music.youtube.com'}:
        if query.path == '/watch':
            # parse_qs omits absent keys, so use .get() to report a missing
            # ?v= as "no id" instead of raising KeyError.
            values = parse_qs(query.query).get('v')
            return values[0] if values else None
        # Path-style links: the id is the segment right after the prefix.
        if query.path.startswith(('/embed/', '/v/', '/shorts/', '/live/')):
            return query.path.split('/')[2]

    # Anything else is not a URL this function knows how to handle.
    return None
# For every input URL: print the English captions with their start times,
# then print the same captions machine-translated to Japanese.
for url in args:
    video_id = extract_video_id(url)
    print('youtube video_id:', video_id)
    print()

    if not video_id:
        # extract_video_id() returns None for URLs it does not recognise;
        # there is nothing to request from the transcript API in that case.
        print('could not extract a video id from:', url)
        continue

    transcript = YouTubeTranscriptApi.list_transcripts(video_id)
    english = transcript.find_transcript(['en'])

    # Fetch the English transcript once and reuse it. `line` keeps the raw
    # entries (dicts with 'text', 'start' and 'duration') for any further
    # processing; `caption_line` collects just the caption texts.
    line = list(english.fetch())
    caption_line = []
    for entry in line:
        print("time", entry['start'], ': ', end='')
        print(entry['text'])
        caption_line.append(entry['text'])

    # Visual separator between the original and the translated captions.
    print()
    print()
    print("************************************************************************************")
    print()
    print("Youtube captions")
    print("- - - - - - - - - - - - - - - - - - translated - - - - - - - - - - - - - - - - - - -")
    print()
    print()

    translated = english.translate('ja')  # Japanese
    for entry in translated.fetch():
        print("time", entry['start'], ": ", end='')
        print(entry['text'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment