Skip to content

Instantly share code, notes, and snippets.

@danuker
Last active September 3, 2025 08:05
Show Gist options
  • Save danuker/81cb7136f6e45528550d4a5cde9d045f to your computer and use it in GitHub Desktop.
YouTube Summarizer
#!/usr/bin/env python3
# The MIT License (MIT)
# Copyright © 2025 Dan Gheorghe Haiduc
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# A script that downloads the transcript of a YouTube video, summarizes it,
# and then lets you chat with the language model about it.
# It uses a local llama.cpp LLM running at http://127.0.0.1:8080,
# and the yt-dlp package (`pip install -U yt-dlp`).
# The ETA is tuned for llama.cpp running Qwen3-30B-A3B-Thinking-2507-UD-Q4_K_XL.gguf
# from Unsloth, on my Intel(R) Core(TM) i9-10900 CPU @ 2.80GHz
# with 64GB of RAM, which starts at about 10-ish tokens/second, but gets slower as more
# history builds up.
import yt_dlp
import requests
import json
import sys
import re
import time
from typing import Optional, List, Dict
from datetime import datetime, timedelta
import urllib.parse
DEBUG = False
def debug(msg):
    """Print *msg* to stdout, but only when the module-level DEBUG flag is on."""
    if not DEBUG:
        return
    print(msg)
def download_transcript(video_url: str) -> Optional[str]:
    """Download the English transcript of a YouTube video using yt-dlp.

    Tries automatic (auto-generated) captions first, then falls back to
    explicitly requested subtitles.

    Args:
        video_url: Full YouTube video URL.

    Returns:
        The transcript as plain text, or None when no English captions are
        available or any error occurs (errors are printed, not raised).
    """
    try:
        ydl_opts = {
            'skip_download': True,       # metadata only, never the video itself
            'writeautomaticsub': True,
            'writesubtitles': True,
            'subtitleslangs': ['en'],
            'subtitlesformat': 'vtt',
            'verbose': False,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=False)
            video_id = info['id']
            debug(f"Video ID: {video_id}")
            debug(f"Available keys: {[k for k in info.keys() if 'sub' in k.lower() or 'capt' in k.lower()]}")
            # yt-dlp can set these keys to None rather than omitting them,
            # so guard with `or {}` to avoid a `'en' in None` TypeError.
            automatic_captions = info.get('automatic_captions') or {}
            debug(f"Automatic captions languages: {list(automatic_captions.keys())}")
            if 'en' in automatic_captions:
                debug("English automatic captions found")
                en_captions = automatic_captions['en']
                debug(f"Available English caption formats: {len(en_captions)}")
                # Prefer a VTT variant; otherwise fall back to the first format.
                vtt_caption = None
                for caption in en_captions:
                    debug(f"Caption format: {caption}")
                    if caption.get('ext') == 'vtt':
                        vtt_caption = caption
                        break
                if not vtt_caption:
                    vtt_caption = en_captions[0]
                caption_url = vtt_caption['url']
                debug(f"Using caption URL: {caption_url}")
                # Timeout added so a stalled CDN response cannot hang forever.
                response = requests.get(caption_url, timeout=30)
                response.raise_for_status()
                # YouTube sometimes serves an HLS playlist instead of a flat
                # VTT file; in that case every segment must be fetched.
                if response.text.strip().startswith('#EXTM3U'):
                    debug("Detected M3U8 playlist, parsing segments...")
                    return parse_m3u8_playlist(response.text, caption_url)
                return parse_vtt(response.text)
            debug("No English automatic captions found")
            requested_subtitles = info.get('requested_subtitles') or {}
            debug(f"Requested subtitles: {requested_subtitles}")
            if 'en' in requested_subtitles:
                sub_url = requested_subtitles['en']['url']
                debug(f"Using requested subtitle URL: {sub_url}")
                response = requests.get(sub_url, timeout=30)
                response.raise_for_status()
                return parse_vtt(response.text)
            debug("No requested subtitles found either")
            return None
    except Exception as e:
        # Deliberately broad: this is a best-effort CLI helper, so report the
        # failure and let the caller deal with the None result.
        print(f"Error downloading transcript: {e}")
        import traceback
        traceback.print_exc()
        return None
def parse_m3u8_playlist(m3u8_content: str, base_url: str) -> str:
    """Reassemble a transcript from an HLS (M3U8) subtitle playlist.

    Each `#EXTINF:` tag in the playlist is followed by the URI of a VTT
    segment; every segment is downloaded, parsed, and concatenated.

    Args:
        m3u8_content: Raw text of the M3U8 playlist.
        base_url: URL the playlist was fetched from (base for relative URIs).

    Returns:
        The combined, de-duplicated plain-text transcript.
    """
    lines = m3u8_content.strip().split('\n')
    # Collect segment URIs: the line after each #EXTINF tag is the media URI.
    segment_urls = []
    for i, line in enumerate(lines):
        if line.startswith('#EXTINF:') and i + 1 < len(lines):
            candidate = lines[i + 1]
            if not candidate.startswith('#'):
                segment_urls.append(candidate)
    all_text = []
    for segment_url in segment_urls:
        try:
            if not segment_url.startswith('http'):
                # Bug fix: resolve relative URIs against the full playlist URL
                # (RFC 3986 semantics). The original joined against only
                # scheme://netloc, which broke path-relative references.
                segment_url = urllib.parse.urljoin(base_url, segment_url)
            debug(f"Downloading segment: {segment_url[:100]}...")
            segment_response = requests.get(segment_url, timeout=30)
            segment_response.raise_for_status()
            all_text.append(parse_vtt(segment_response.text))
        except Exception as e:
            # Best-effort: a failed segment loses a few seconds of captions
            # but should not abort the whole transcript.
            print(f"Error downloading segment {segment_url}: {e}")
            continue
    return deduplicate_lines('\n'.join(all_text))
def deduplicate_lines(text: str) -> str:
    """Collapse consecutive duplicate lines, dropping blank lines.

    Bug fix: the original compared the *raw* line against the previous raw
    line while appending the *stripped* line, so two consecutive lines that
    differed only in surrounding whitespace were both kept. Comparison now
    uses the stripped form, matching what is actually emitted.

    Args:
        text: Newline-separated text.

    Returns:
        The text with blank lines removed and consecutive repeats collapsed.
    """
    final_lines = []
    prev = None
    for raw in text.split('\n'):
        line = raw.strip()
        if line and line != prev:
            final_lines.append(line)
            prev = line
    return '\n'.join(final_lines)
def parse_vtt(vtt_content: str) -> str:
"""
Parse VTT subtitle format to plain text, handling YouTube's specific format
"""
lines = vtt_content.strip().split('\n')
text_lines = []
# YouTube's VTT format has special syntax like <00:00:00.440><c> I </c>
# We need to extract the text content between the tags
for line in lines:
# Skip VTT metadata lines
if line.startswith('WEBVTT') or line.startswith('Kind:') or line.startswith('Language:'):
continue
# Skip empty lines
if not line.strip():
continue
# Skip timing lines (lines with '-->' in them)
if ' --> ' in line:
continue
# Handle YouTube's special VTT format with timing tags
# Pattern: <00:00:00.440><c> I </c>
# We want to extract just the text content
if '<' in line and '>' in line:
# Remove all HTML-like tags but preserve text content
# This handles YouTube's format like <00:00:00.440><c> I </c>
cleaned_line = re.sub(r'<[^>]+>', '', line)
# Remove extra whitespace
cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
if cleaned_line:
text_lines.append(cleaned_line)
else:
# Regular text line
text_lines.append(line.strip())
# Join lines but remove duplicate consecutive lines
final_lines = []
prev_line = ""
for line in text_lines:
if line != prev_line:
final_lines.append(line)
prev_line = line
return '\n'.join(final_lines)
def chat_with_llm(messages: List[Dict[str, str]], llm_url: str = "http://127.0.0.1:8080", stream: bool = True) -> str:
    """Send a chat-completion request to a local llama.cpp server.

    When *stream* is True, tokens are printed to stdout as they arrive
    (SSE "data:" lines); otherwise the full reply is printed at once.

    Args:
        messages: OpenAI-style chat messages (role/content dicts).
        llm_url: Base URL of the llama.cpp server.
        stream: Whether to request and render a streamed response.

    Returns:
        The full assistant reply, or "" on any error.
    """
    payload = {
        "messages": messages,
        "temperature": 0.6,
        "max_tokens": 2000,
        "stream": stream,
    }
    try:
        # Bug fix: honour the `stream` argument. The original hard-coded
        # stream=True on the HTTP request and always parsed SSE, so a
        # stream=False call received a plain JSON body it could not parse.
        response = requests.post(f"{llm_url}/chat/completions", json=payload, stream=stream)
        response.raise_for_status()
        if not stream:
            data = response.json()
            content = data['choices'][0]['message'].get('content') or ''
            print(content)
            return content
        full_reply = ""
        for line in response.iter_lines():
            if not line:
                continue
            decoded_line = line.decode("utf-8")
            if not decoded_line.startswith("data:"):
                continue
            data_str = decoded_line[5:].strip()
            if data_str == "[DONE]":
                break
            try:
                data = json.loads(data_str)
                content = data['choices'][0]['delta'].get('content', '')
            except (json.JSONDecodeError, KeyError, IndexError):
                # Skip malformed or keep-alive chunks rather than aborting.
                continue
            if content:
                print(content, end='', flush=True)
                full_reply += content
        print("")
        return full_reply
    except Exception as e:
        # Broad on purpose: connection refused, timeouts, bad HTTP status —
        # the caller only needs to know that no reply was produced.
        print(f"Error contacting LLM: {e}")
        return ""
def summarize_with_llm(text: str, llm_url: str = "http://127.0.0.1:8080") -> Optional[str]:
    """Ask the local LLM for a summary of *text* (a video transcript).

    Returns the summary string, or "" when the LLM call fails.
    """
    prompt = f"Summarize the following YouTube video transcript:\n\n{text}"
    conversation = [
        {"role": "system", "content": "You are a helpful assistant that summarizes YouTube video transcripts."},
        {"role": "user", "content": prompt},
    ]
    return chat_with_llm(conversation, llm_url)
def interactive_console(initial_summary: str, llm_url: str = "http://127.0.0.1:8080"):
    """Run a read-eval-print chat loop about the summarized video.

    The conversation is seeded with the summary as the assistant's first
    message so follow-up questions have context. Type 'exit' or 'quit'
    (or press Ctrl-D / Ctrl-C) to leave.

    Args:
        initial_summary: The LLM-produced summary to seed the chat with.
        llm_url: Base URL of the llama.cpp server.
    """
    print("\nEntering interactive chat mode. Type 'exit' to quit.")
    conversation = [
        {"role": "system", "content": "You are a helpful assistant that answers questions about the YouTube video."},
        {"role": "assistant", "content": initial_summary},
    ]
    while True:
        # Robustness fix: Ctrl-D (EOFError) or Ctrl-C previously crashed
        # with a traceback; treat both as a clean exit.
        try:
            user_input = input("\n> ")
        except (EOFError, KeyboardInterrupt):
            print("\nGoodbye!")
            break
        if user_input.lower() in ["exit", "quit"]:
            print("Goodbye!")
            break
        conversation.append({"role": "user", "content": user_input})
        reply = chat_with_llm(conversation, llm_url)
        conversation.append({"role": "assistant", "content": reply})
def main():
    """CLI entry point: download a transcript, summarize it, then chat.

    Usage: python youtube_summarizer.py <youtube_url>
    Exits non-zero when arguments are wrong, the transcript cannot be
    fetched, or the LLM returns nothing.
    """
    started_at = time.time()
    if len(sys.argv) != 2:
        print("Usage: python youtube_summarizer.py <youtube_url>")
        sys.exit(1)
    url = sys.argv[1]

    print("Downloading transcript...")
    transcript = download_transcript(url)
    if not transcript:
        print("Failed to download transcript")
        sys.exit(1)
    print("Transcript downloaded successfully")
    print(f"Transcript length: {len(transcript)} characters")

    # Show a short preview so the user can sanity-check the content.
    print("\nFirst 200 characters of transcript:")
    preview = transcript[:200] + "..." if len(transcript) > 200 else transcript
    print(preview)

    # Linear ETA model fitted to llama.cpp running
    # Qwen3-30B-A3B-Thinking-2507-UD-Q4_K_XL.gguf (Unsloth) on an
    # Intel i9-10900 @ 2.80GHz with 64GB RAM.
    estimated_minutes = len(transcript) * .0003032768268323 + .054881491
    eta = datetime.now() + timedelta(minutes=estimated_minutes)
    print(f"ETA of completion: {eta.strftime('%Y-%m-%d %H:%M:%S')}")

    print("\nSummarizing with LLM...")
    banner = "=" * 50
    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    summary = summarize_with_llm(transcript)
    print(banner)
    if not summary:
        print("Failed to get summary from LLM")
        sys.exit(1)

    elapsed_minutes = (time.time() - started_at) / 60
    print(f"Took {elapsed_minutes:.2f} minutes.")
    interactive_console(summary)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment