Skip to content

Instantly share code, notes, and snippets.

@essingen123
Created January 2, 2025 02:58
Show Gist options
  • Save essingen123/95adec40d2e5022c139fe3e91130b1dd to your computer and use it in GitHub Desktop.
Save essingen123/95adec40d2e5022c139fe3e91130b1dd to your computer and use it in GitHub Desktop.
YouTube playlist to transcripts, for accessibility and AI-assisted processing, etc.
import subprocess
import os
import re
# Fast draft by Kilian Lindberg
# Purpose: accessibility -- make video transcripts easy to process (e.g. with
# AI tools) when visual interaction with the video itself is limited.
# Disclaimer: code as is, no claims but an inspirational aspiration.

# URL of the playlist
playlist_url = 'https://youtube.com/playlist?list=PLf-tvI4zeKLIaHFaIfvla03TAedPHUsln'
# Scratch file that yt-dlp fills with one video URL per line
links_file = 'file.txt'
# Combined output file holding every collected transcript
transcript_file = 'YouTube-transcripts.txt'

# A WebVTT cue timing line, e.g. "00:00:01.000 --> 00:00:03.500 align:start".
# The hours field is optional in WebVTT, and the dots are escaped so that only
# a literal "." separates seconds from milliseconds.
_CUE_TIMING_RE = re.compile(
    r'^(?:\d{2,}:)?\d{2}:\d{2}\.\d{3} --> (?:\d{2,}:)?\d{2}:\d{2}\.\d{3}.*$'
)
# Inline markup such as <c> ... </c> or word-level <00:00:01.200> timestamps.
_TAG_RE = re.compile(r'<[^>]+>')
# The "v" query parameter of a watch URL.
_VIDEO_ID_RE = re.compile(r'[?&]v=([^&#]+)')


def video_id_from_url(link):
    """Return the video id contained in a YouTube video URL.

    The ``v`` query parameter is used when present, so additional parameters
    (``&list=...``, ``&t=...``) do not end up in the id. Otherwise the last
    path segment is returned, which covers short ``youtu.be/<id>`` links.
    """
    match = _VIDEO_ID_RE.search(link)
    if match:
        return match.group(1)
    path = re.split(r'[?#]', link, maxsplit=1)[0]
    return path.rstrip('/').rsplit('/', 1)[-1]


def vtt_to_text(vtt):
    """Reduce the contents of a WebVTT subtitle file to plain transcript text.

    Removes the ``WEBVTT`` header block, cue timing lines, inline tags and
    blank lines, and collapses consecutive identical lines.

    Args:
        vtt: Full text of a ``.vtt`` file.

    Returns:
        The remaining caption lines joined with ``\\n``; an empty string when
        nothing is left.
    """
    lines = vtt.lstrip('\ufeff').splitlines()

    # The header block (WEBVTT / Kind / Language ...) runs up to the first
    # blank line; it is metadata, not transcript text.
    if lines and lines[0].startswith('WEBVTT'):
        header_end = next(
            (i for i, line in enumerate(lines) if not line.strip()),
            len(lines),
        )
        lines = lines[header_end:]

    kept = []
    for line in lines:
        if _CUE_TIMING_RE.match(line):
            continue
        text = _TAG_RE.sub('', line).strip()
        if not text:
            continue
        # Auto-generated captions typically "roll": each cue repeats the
        # previous line before adding a new one. Dropping consecutive
        # duplicates keeps each spoken line once.
        if kept and kept[-1] == text:
            continue
        kept.append(text)
    return '\n'.join(kept)


def ensure_yt_dlp():
    """Install yt-dlp with pip, but only if the command is not already usable."""
    try:
        subprocess.run(
            ['yt-dlp', '--version'],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (FileNotFoundError, subprocess.CalledProcessError):
        subprocess.run(['pip', 'install', 'yt-dlp'])


def list_playlist_links(url, links_path):
    """Return the video URLs of the playlist at *url*.

    yt-dlp writes the URLs to *links_path*, one per line. An empty list is
    returned when yt-dlp produced no file (for example because the listing
    failed).
    """
    # --print-to-file appends, so a file left over from an earlier run would
    # make every link (and later every transcript) show up twice.
    if os.path.exists(links_path):
        os.remove(links_path)

    command = ['yt-dlp', '--flat-playlist', '-i', '--print-to-file', 'url', links_path, url]
    subprocess.run(command)

    try:
        with open(links_path, 'r', encoding='utf-8') as file:
            return [line.strip() for line in file if line.strip()]
    except FileNotFoundError:
        return []


def fetch_transcript(link):
    """Download the English auto-generated subtitles for *link* as plain text.

    Returns:
        The cleaned transcript, or ``None`` when yt-dlp did not produce the
        expected ``<id>.en.vtt`` file.
    """
    video_id = video_id_from_url(link)
    transcript_command = ['yt-dlp', '--write-auto-sub', '--skip-download', '--sub-lang', 'en', '-o', '%(id)s.%(ext)s', link]
    subprocess.run(transcript_command)

    transcript_filename = f'{video_id}.en.vtt'
    try:
        # Read explicitly as UTF-8 instead of the platform default encoding.
        with open(transcript_filename, 'r', encoding='utf-8') as tf:
            return vtt_to_text(tf.read())
    except FileNotFoundError:
        print(f'Transcript for {link} not found.')
        return None


def main():
    """Collect the transcripts of every video in ``playlist_url`` into one file."""
    ensure_yt_dlp()

    links = list_playlist_links(playlist_url, links_file)
    if not links:
        print(f'No video links found for {playlist_url}.')

    # Mode 'w' discards the output of any previous run; the file is opened
    # once instead of being re-opened for every video.
    with open(transcript_file, 'w', encoding='utf-8') as af:
        for link in links:
            raw_text = fetch_transcript(link)
            if raw_text:
                af.write(raw_text)
                af.write('\n\n')

    print(f'All transcripts have been collected in {transcript_file}')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment