#!/usr/bin/env python3
# The MIT License (MIT)
# Copyright © 2025 Dan Gheorghe Haiduc
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# A script that downloads the transcript of a YouTube video, summarizes it,
# and then lets you chat with the language model about it.
# It uses a local llama.cpp LLM running at http://127.0.0.1:8080,
# and the yt-dlp package (`pip install -U yt-dlp`).
# The ETA is tuned for llama.cpp running Qwen3-30B-A3B-Thinking-2507-UD-Q4_K_XL.gguf
# from Unsloth, on my Intel(R) Core(TM) i9-10900 CPU @ 2.80GHz
# with 64GB of RAM, which starts at about 10-ish tokens/second, but gets slower
# as more history builds up.
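#
# Example invocation (VIDEO_ID is a placeholder, and the llama-server command
# below is one assumed way to start the local model, not part of this script):
#   llama-server -m Qwen3-30B-A3B-Thinking-2507-UD-Q4_K_XL.gguf --port 8080
#   python3 youtube_summarizer.py "https://www.youtube.com/watch?v=VIDEO_ID"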
import yt_dlp
import requests
import json
import sys
import re
import time
from typing import Optional, List, Dict
from datetime import datetime, timedelta
import urllib.parse

DEBUG = False


def debug(msg):
    """
    Print but only if DEBUG is on.
    """
    if DEBUG:
        print(msg)
def download_transcript(video_url: str) -> Optional[str]:
    """
    Download the transcript of a YouTube video using yt-dlp.
    """
    try:
        # Ask yt-dlp for English subtitles (manual or automatic),
        # without downloading the video itself
        ydl_opts = {
            'skip_download': True,
            'writeautomaticsub': True,
            'writesubtitles': True,
            'subtitleslangs': ['en'],
            'subtitlesformat': 'vtt',
            'verbose': False,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=False)
            video_id = info['id']
            debug(f"Video ID: {video_id}")

            # Check what's available in the info dict
            debug(f"Available keys: {[k for k in info.keys() if 'sub' in k.lower() or 'capt' in k.lower()]}")

            # Try automatic captions first ('or {}' guards against a None value)
            automatic_captions = info.get('automatic_captions') or {}
            debug(f"Automatic captions languages: {list(automatic_captions.keys())}")

            if 'en' in automatic_captions:
                debug("English automatic captions found")
                en_captions = automatic_captions['en']
                debug(f"Available English caption formats: {len(en_captions)}")

                # Prefer a VTT format; if none is found, use the first available
                vtt_caption = None
                for caption in en_captions:
                    debug(f"Caption format: {caption}")
                    if caption.get('ext') == 'vtt':
                        vtt_caption = caption
                        break
                if not vtt_caption:
                    vtt_caption = en_captions[0]

                caption_url = vtt_caption['url']
                debug(f"Using caption URL: {caption_url}")

                # Download the subtitle content
                response = requests.get(caption_url, timeout=30)
                response.raise_for_status()

                # If the response is an M3U8 playlist, parse it and fetch the
                # individual segments; otherwise it is a regular VTT file
                if response.text.strip().startswith('#EXTM3U'):
                    debug("Detected M3U8 playlist, parsing segments...")
                    return parse_m3u8_playlist(response.text, caption_url)
                else:
                    return parse_vtt(response.text)
            else:
                debug("No English automatic captions found")
                # Fall back to manually uploaded subtitles, if any matched
                requested_subtitles = info.get('requested_subtitles') or {}
                debug(f"Requested subtitles: {requested_subtitles}")
                if 'en' in requested_subtitles:
                    sub_url = requested_subtitles['en']['url']
                    debug(f"Using requested subtitle URL: {sub_url}")
                    response = requests.get(sub_url, timeout=30)
                    response.raise_for_status()
                    return parse_vtt(response.text)
                else:
                    debug("No requested subtitles found either")
                    return None
    except Exception as e:
        print(f"Error downloading transcript: {e}")
        import traceback
        traceback.print_exc()
        return None
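

# For debugging, roughly the same captions can be fetched from the shell with
# the yt-dlp CLI (flags assumed to mirror the options used above):
#   yt-dlp --skip-download --write-auto-subs --sub-langs en --sub-format vtt <url>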
def parse_m3u8_playlist(m3u8_content: str, base_url: str) -> str:
    """
    Parse an M3U8 playlist and download all of its segments
    to reconstruct the transcript.
    """
    lines = m3u8_content.strip().split('\n')

    # Extract the segment URLs: each #EXTINF line is followed by
    # the URL of the segment it describes
    segment_urls = []
    for i, line in enumerate(lines):
        if line.startswith('#EXTINF:'):
            if i + 1 < len(lines):
                segment_url = lines[i + 1]
                if not segment_url.startswith('#'):
                    segment_urls.append(segment_url)

    # Download each segment and concatenate the parsed text
    all_text = []
    for segment_url in segment_urls:
        try:
            # Handle relative URLs
            if not segment_url.startswith('http'):
                parsed_base = urllib.parse.urlparse(base_url)
                segment_url = urllib.parse.urljoin(f"{parsed_base.scheme}://{parsed_base.netloc}", segment_url)
            debug(f"Downloading segment: {segment_url[:100]}...")
            segment_response = requests.get(segment_url, timeout=30)
            segment_response.raise_for_status()
            # Parse this segment's VTT content
            all_text.append(parse_vtt(segment_response.text))
        except Exception as e:
            print(f"Error downloading segment {segment_url}: {e}")
            continue

    # Combine all text and deduplicate
    combined_text = '\n'.join(all_text)
    return deduplicate_lines(combined_text)
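

# For reference, a subtitle M3U8 playlist looks roughly like this
# (illustrative shape only; real segment URLs are signed YouTube links):
#   #EXTM3U
#   #EXTINF:10.0,
#   https://example.com/segment-0.vtt
#   #EXTINF:10.0,
#   https://example.com/segment-1.vtt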
def deduplicate_lines(text: str) -> str:
    """
    Remove duplicate consecutive lines.
    """
    lines = text.split('\n')
    final_lines = []
    prev_line = ""
    for line in lines:
        if line.strip() and line != prev_line:
            final_lines.append(line.strip())
        prev_line = line
    return '\n'.join(final_lines)
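

# A quick sanity check of the behavior (illustrative):
#   deduplicate_lines("hello\nhello\nworld") -> "hello\nworld"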
def parse_vtt(vtt_content: str) -> str:
    """
    Parse the VTT subtitle format to plain text, handling YouTube's
    specific format.
    """
    lines = vtt_content.strip().split('\n')
    text_lines = []

    # YouTube's VTT format has special syntax like <00:00:00.440><c> I </c>.
    # We need to extract the text content between the tags.
    for line in lines:
        # Skip VTT metadata lines
        if line.startswith('WEBVTT') or line.startswith('Kind:') or line.startswith('Language:'):
            continue
        # Skip empty lines
        if not line.strip():
            continue
        # Skip timing lines (lines with '-->' in them)
        if ' --> ' in line:
            continue
        if '<' in line and '>' in line:
            # Remove all HTML-like tags but preserve the text content.
            # This handles YouTube's format like <00:00:00.440><c> I </c>
            cleaned_line = re.sub(r'<[^>]+>', '', line)
            # Collapse extra whitespace
            cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
            if cleaned_line:
                text_lines.append(cleaned_line)
        else:
            # Regular text line
            text_lines.append(line.strip())

    # Join lines, removing duplicate consecutive lines
    final_lines = []
    prev_line = ""
    for line in text_lines:
        if line != prev_line:
            final_lines.append(line)
            prev_line = line
    return '\n'.join(final_lines)
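

# Example of what the parser does (illustrative cue, not real output):
#   "00:00:01.000 --> 00:00:02.000\n<00:00:01.440><c> hello</c> world"
#   becomes "hello world"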
def chat_with_llm(messages: List[Dict[str, str]], llm_url: str = "http://127.0.0.1:8080", stream: bool = True) -> str:
    """
    Send a chat completion request to the local LLM, print the reply to
    stdout as it streams in, and return the full reply. The response
    parsing assumes a streaming, SSE-style reply.
    """
    payload = {
        "messages": messages,
        "temperature": 0.6,
        "max_tokens": 2000,
        "stream": stream
    }
    try:
        response = requests.post(f"{llm_url}/chat/completions", json=payload, stream=True)
        response.raise_for_status()
        full_reply = ""
        # Parse the server-sent events stream: each event is a line of the
        # form "data: {json}", terminated by "data: [DONE]"
        for line in response.iter_lines():
            if line:
                decoded_line = line.decode("utf-8")
                if decoded_line.startswith("data:"):
                    data_str = decoded_line[5:].strip()
                    if data_str == "[DONE]":
                        break
                    try:
                        data = json.loads(data_str)
                        content = data['choices'][0]['delta'].get('content', '')
                        if content:
                            print(content, end='', flush=True)
                            full_reply += content
                    except Exception:
                        continue
        print("")
        return full_reply
    except Exception as e:
        print(f"Error contacting LLM: {e}")
        return ""
def summarize_with_llm(text: str, llm_url: str = "http://127.0.0.1:8080") -> Optional[str]:
    messages = [
        {"role": "system", "content": "You are a helpful assistant that summarizes YouTube video transcripts."},
        {"role": "user", "content": f"Summarize the following YouTube video transcript:\n\n{text}"}
    ]
    return chat_with_llm(messages, llm_url)
def interactive_console(initial_summary: str, llm_url: str = "http://127.0.0.1:8080"):
    print("\nEntering interactive chat mode. Type 'exit' to quit.")
    # Seed the conversation with the summary, so follow-up questions have context
    conversation = [
        {"role": "system", "content": "You are a helpful assistant that answers questions about the YouTube video."},
        {"role": "assistant", "content": initial_summary}
    ]
    while True:
        user_input = input("\n> ")
        if user_input.lower() in ["exit", "quit"]:
            print("Goodbye!")
            break
        conversation.append({"role": "user", "content": user_input})
        reply = chat_with_llm(conversation, llm_url)
        conversation.append({"role": "assistant", "content": reply})
def main():
    start = time.time()
    if len(sys.argv) != 2:
        print("Usage: python youtube_summarizer.py <youtube_url>")
        sys.exit(1)

    video_url = sys.argv[1]
    print("Downloading transcript...")
    transcript = download_transcript(video_url)
    if not transcript:
        print("Failed to download transcript")
        sys.exit(1)

    print("Transcript downloaded successfully")
    print(f"Transcript length: {len(transcript)} characters")

    # Show the first 200 characters to verify the content
    print("\nFirst 200 characters of transcript:")
    print((transcript[:200] + "...") if len(transcript) > 200 else transcript)

    # Linear estimate (minutes ≈ slope · characters + intercept), tuned to
    # llama.cpp running Qwen3-30B-A3B-Thinking-2507-UD-Q4_K_XL.gguf
    # from Unsloth, on my Intel(R) Core(TM) i9-10900 CPU @ 2.80GHz
    # with 64GB of RAM
    estimated_minutes = len(transcript) * .0003032768268323 + .054881491
    start_time = datetime.now()
    completion_time = start_time + timedelta(minutes=estimated_minutes)
    print(f"ETA of completion: {completion_time.strftime('%Y-%m-%d %H:%M:%S')}")

    print("\nSummarizing with LLM...")
    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    summary = summarize_with_llm(transcript)
    print("=" * 50)
    if not summary:
        print("Failed to get summary from LLM")
        sys.exit(1)

    end = time.time()
    print(f"Took {(end - start) / 60:.2f} minutes.")
    interactive_console(summary)


if __name__ == "__main__":
    main()