Created
January 2, 2025 02:58
-
-
Save essingen123/95adec40d2e5022c139fe3e91130b1dd to your computer and use it in GitHub Desktop.
YouTube playlist to transcription, for accessibility and AI-assisted processing, etc.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Collect the auto-generated English subtitles of a YouTube playlist.

The script lists every video of ``playlist_url`` with yt-dlp, downloads the
automatic English subtitles (WebVTT) of each video without downloading the
video itself, strips cue timings and inline tags, and appends the remaining
text of all videos to ``transcript_file``.
"""
import os
import re
import subprocess
import sys
from urllib.parse import parse_qs, urlparse

# Fast draft by Kilian Lindberg
# Purpose: Accessibility, ease transcript processing for AI in case of limited visual video interaction limitation
# Disclaimer: Code as is, no claims but an inspirational aspiration

# URL of the playlist
playlist_url = 'https://youtube.com/playlist?list=PLf-tvI4zeKLIaHFaIfvla03TAedPHUsln'

# File that receives the cleaned text of every video
transcript_file = 'YouTube-transcripts.txt'

# Run yt-dlp through the current interpreter so the copy installed by
# install_yt_dlp() is the one used, even when no `yt-dlp` script is on PATH.
_YT_DLP = [sys.executable, '-m', 'yt_dlp']

# A WebVTT cue timing line, e.g. "00:00:01.000 --> 00:00:03.500 align:start".
# The dots are escaped so only a literal "." separates seconds and millis.
_TIMECODE_LINE = re.compile(r'\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}.*\n')

# Inline markup such as <c> ... </c> or <00:00:01.500> word timestamps.
_TAG = re.compile(r'<[^>]+>')


def clean_vtt(transcript):
    """Return *transcript* without cue timing lines and inline tags.

    Everything else (including the ``WEBVTT`` header) is kept; leading and
    trailing whitespace is stripped.
    """
    raw_text = _TIMECODE_LINE.sub('', transcript)  # Remove time codes
    raw_text = _TAG.sub('', raw_text)  # Remove HTML tags
    return raw_text.strip()


def extract_video_id(link):
    """Return the video id contained in *link*.

    Prefers the ``v`` query parameter so that extra parameters (``&list=``,
    ``&t=`` ...) do not end up in the id; otherwise falls back to the last
    path segment (``youtu.be/<id>``, ``/shorts/<id>``, or a bare id).
    """
    parsed = urlparse(link)
    ids = parse_qs(parsed.query).get('v')
    if ids:
        return ids[0]
    tail = parsed.path.rstrip('/').rsplit('/', 1)[-1]
    return tail or link.split('=')[-1]


def install_yt_dlp():
    """Install yt-dlp with pip into the interpreter running this script."""
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'yt-dlp'])


def fetch_playlist_links(url, list_file='file.txt'):
    """Return the video URLs of the playlist at *url*.

    yt-dlp writes one URL per line to *list_file*. An empty list is returned
    when yt-dlp produced no file (e.g. the playlist could not be listed).
    """
    # --print-to-file appends, so a file left over from an earlier run would
    # make every video appear (and be processed) more than once.
    if os.path.exists(list_file):
        os.remove(list_file)

    # Command to get video URLs in the playlist
    command = _YT_DLP + ['--flat-playlist', '-i', '--print-to-file', 'url', list_file, url]
    subprocess.run(command)

    try:
        with open(list_file, 'r', encoding='utf-8') as file:
            # Skip blank lines so yt-dlp is never invoked with an empty URL.
            return [line.strip() for line in file if line.strip()]
    except FileNotFoundError:
        return []


def fetch_transcript(link):
    """Download the auto-generated English subtitles of *link*.

    Returns the cleaned subtitle text, or ``None`` when yt-dlp did not write
    a subtitle file for the video.
    """
    video_id = extract_video_id(link)
    transcript_command = _YT_DLP + [
        '--write-auto-sub', '--skip-download', '--sub-lang', 'en',
        '-o', '%(id)s.%(ext)s', link,
    ]
    subprocess.run(transcript_command)

    transcript_filename = f'{video_id}.en.vtt'
    try:
        # Subtitle files are UTF-8; do not rely on the platform default.
        with open(transcript_filename, 'r', encoding='utf-8') as tf:
            return clean_vtt(tf.read())
    except FileNotFoundError:
        return None


def main():
    """Install yt-dlp, then gather all playlist transcripts into one file."""
    install_yt_dlp()

    links = fetch_playlist_links(playlist_url)
    if not links:
        print(f'No videos found for {playlist_url}.')

    # Start from a clean output file; transcripts are appended one by one.
    if os.path.exists(transcript_file):
        os.remove(transcript_file)

    for link in links:
        raw_text = fetch_transcript(link)
        if raw_text is None:
            print(f'Transcript for {link} not found.')
            continue
        if raw_text:
            with open(transcript_file, 'a', encoding='utf-8') as af:
                af.write(raw_text)
                af.write('\n\n')

    print(f'All transcripts have been collected in {transcript_file}')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment