essingen123 · January 2, 2025 02:58
diff --git a/yt_playlist_2_transcription.py b/yt_playlist_2_transcription.py
 import subprocess
 import os
 import re

 # Fast draft by Kilian Lindberg
 # Purpose: Accessibility - make video transcripts easy for AI tools to process when visual interaction with the video is limited
 # Disclaimer: Code as is, no claims but an inspirational aspiration

 # Collect the auto-generated English transcripts of a YouTube playlist into
 # one text file, using yt-dlp for both the playlist listing and the subtitles.

 # URL of the playlist
 playlist_url = 'https://youtube.com/playlist?list=PLf-tvI4zeKLIaHFaIfvla03TAedPHUsln'

 # Combined output file; it is rewritten from scratch on every run.
 transcript_file = 'YouTube-transcripts.txt'

 # Scratch file that yt-dlp fills with one video URL per line.
 links_file = 'file.txt'


 def extract_video_id(link):
     """Return the video id part of *link*.

     The result is expected to equal the ``%(id)s`` value yt-dlp puts in the
     subtitle filename. The ``v`` query parameter is preferred so that extra
     parameters such as ``&list=...`` do not end up in the id; otherwise the
     text after the last ``=`` is returned.
     """
     match = re.search(r'[?&]v=([^&#]+)', link)
     if match:
         return match.group(1)
     return link.split('=')[-1]


 def vtt_to_text(vtt):
     """Return *vtt* with cue timing lines and angle-bracket tags removed.

     Surrounding whitespace is stripped. Every other line (for example the
     ``WEBVTT`` header) is kept unchanged.
     """
     # Drop whole timing lines, including anything that follows the timestamps.
     text = re.sub(
         r'\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}.*\n', '', vtt)
     # Drop anything between angle brackets (inline markup tags).
     text = re.sub(r'<[^>]+>', '', text)
     return text.strip()


 def read_playlist_links():
     """Ask yt-dlp for the playlist's video URLs and return them as a list.

     Returns an empty list (after printing a notice) when yt-dlp produced no
     links file.
     """
     # --print-to-file appends, so a leftover file from an earlier run would
     # make every video show up more than once.
     if os.path.exists(links_file):
         os.remove(links_file)
     command = ['yt-dlp', '--flat-playlist', '-i', '--print-to-file', 'url', links_file, playlist_url]
     # Best effort: a non-zero exit status is not treated as fatal.
     subprocess.run(command, check=False)
     try:
         with open(links_file, 'r', encoding='utf-8') as file:
             # Skip blank lines so they are not passed to yt-dlp as URLs.
             return [line.strip() for line in file if line.strip()]
     except FileNotFoundError:
         print(f'Could not list videos for {playlist_url}.')
         return []


 def main():
     """Install yt-dlp, fetch each video's subtitles and merge the cleaned text."""
     import sys

     # Install yt-dlp with pip for the interpreter that runs this script.
     subprocess.run([sys.executable, '-m', 'pip', 'install', 'yt-dlp'], check=False)

     links = read_playlist_links()

     # Mode 'w' discards the output of any earlier run; the file is opened once
     # instead of once per video.
     with open(transcript_file, 'w', encoding='utf-8') as af:
         for link in links:
             video_id = extract_video_id(link)
             transcript_command = ['yt-dlp', '--write-auto-sub', '--skip-download', '--sub-lang', 'en', '-o', '%(id)s.%(ext)s', link]
             subprocess.run(transcript_command, check=False)
             transcript_filename = f'{video_id}.en.vtt'
             try:
                 with open(transcript_filename, 'r', encoding='utf-8') as tf:
                     raw_text = vtt_to_text(tf.read())
             except FileNotFoundError:
                 print(f'Transcript for {link} not found.')
                 continue
             if raw_text:
                 af.write(raw_text)
                 af.write('\n\n')

     print(f'All transcripts have been collected in {transcript_file}')


 if __name__ == '__main__':
     main()
	import subprocess
	import os
	import re

	# Fast draft by Kilian Lindberg
	# Purpose: Accessibility - make video transcripts easy for AI tools to process when visual interaction with the video is limited
	# Disclaimer: Code as is, no claims but an inspirational aspiration

	# Collect the auto-generated English transcripts of a YouTube playlist into
	# one text file, using yt-dlp for both the playlist listing and the subtitles.

	# URL of the playlist
	playlist_url = 'https://youtube.com/playlist?list=PLf-tvI4zeKLIaHFaIfvla03TAedPHUsln'

	# Combined output file; it is rewritten from scratch on every run.
	transcript_file = 'YouTube-transcripts.txt'

	# Scratch file that yt-dlp fills with one video URL per line.
	links_file = 'file.txt'


	def extract_video_id(link):
	    """Return the video id part of *link*.

	    The result is expected to equal the ``%(id)s`` value yt-dlp puts in the
	    subtitle filename. The ``v`` query parameter is preferred so that extra
	    parameters such as ``&list=...`` do not end up in the id; otherwise the
	    text after the last ``=`` is returned.
	    """
	    match = re.search(r'[?&]v=([^&#]+)', link)
	    if match:
	        return match.group(1)
	    return link.split('=')[-1]


	def vtt_to_text(vtt):
	    """Return *vtt* with cue timing lines and angle-bracket tags removed.

	    Surrounding whitespace is stripped. Every other line (for example the
	    ``WEBVTT`` header) is kept unchanged.
	    """
	    # Drop whole timing lines, including anything that follows the timestamps.
	    text = re.sub(
	        r'\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}.*\n', '', vtt)
	    # Drop anything between angle brackets (inline markup tags).
	    text = re.sub(r'<[^>]+>', '', text)
	    return text.strip()


	def read_playlist_links():
	    """Ask yt-dlp for the playlist's video URLs and return them as a list.

	    Returns an empty list (after printing a notice) when yt-dlp produced no
	    links file.
	    """
	    # --print-to-file appends, so a leftover file from an earlier run would
	    # make every video show up more than once.
	    if os.path.exists(links_file):
	        os.remove(links_file)
	    command = ['yt-dlp', '--flat-playlist', '-i', '--print-to-file', 'url', links_file, playlist_url]
	    # Best effort: a non-zero exit status is not treated as fatal.
	    subprocess.run(command, check=False)
	    try:
	        with open(links_file, 'r', encoding='utf-8') as file:
	            # Skip blank lines so they are not passed to yt-dlp as URLs.
	            return [line.strip() for line in file if line.strip()]
	    except FileNotFoundError:
	        print(f'Could not list videos for {playlist_url}.')
	        return []


	def main():
	    """Install yt-dlp, fetch each video's subtitles and merge the cleaned text."""
	    import sys

	    # Install yt-dlp with pip for the interpreter that runs this script.
	    subprocess.run([sys.executable, '-m', 'pip', 'install', 'yt-dlp'], check=False)

	    links = read_playlist_links()

	    # Mode 'w' discards the output of any earlier run; the file is opened once
	    # instead of once per video.
	    with open(transcript_file, 'w', encoding='utf-8') as af:
	        for link in links:
	            video_id = extract_video_id(link)
	            transcript_command = ['yt-dlp', '--write-auto-sub', '--skip-download', '--sub-lang', 'en', '-o', '%(id)s.%(ext)s', link]
	            subprocess.run(transcript_command, check=False)
	            transcript_filename = f'{video_id}.en.vtt'
	            try:
	                with open(transcript_filename, 'r', encoding='utf-8') as tf:
	                    raw_text = vtt_to_text(tf.read())
	            except FileNotFoundError:
	                print(f'Transcript for {link} not found.')
	                continue
	            if raw_text:
	                af.write(raw_text)
	                af.write('\n\n')

	    print(f'All transcripts have been collected in {transcript_file}')


	if __name__ == '__main__':
	    main()