#!/usr/bin/env python3
# The MIT License (MIT)
# Copyright © 2025 Dan Gheorghe Haiduc
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# A script that downloads the transcript of a YouTube video, summarizes it,
# and then lets you chat with the language model about it.
# It uses a local llama.cpp LLM running at http://127.0.0.1:8080,
# and the yt-dlp package (`pip install -U yt-dlp`).
# The ETA is tuned for llama.cpp running Qwen3-30B-A3B-Thinking-2507-UD-Q4_K_XL.gguf
# from Unsloth, on my Intel(R) Core(TM) i9-10900 CPU @ 2.80GHz
# with 64GB of RAM, which starts at about 10-ish tokens/second, but gets slower
# as more history builds up.
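#
# Example invocation (VIDEO_ID is a placeholder, and the llama-server command
# below is one assumed way to start the local model, not part of this script):
#   llama-server -m Qwen3-30B-A3B-Thinking-2507-UD-Q4_K_XL.gguf --port 8080
#   python3 youtube_summarizer.py "https://www.youtube.com/watch?v=VIDEO_ID"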
import yt_dlp
import requests
import json
import sys
import re
import time
from typing import Optional, List, Dict
from datetime import datetime, timedelta
import urllib.parse

DEBUG = False


def debug(msg):
    """
    Print but only if DEBUG is on.
    """
    if DEBUG:
        print(msg)
def download_transcript(video_url: str) -> Optional[str]:
    """
    Download the transcript of a YouTube video using yt-dlp.
    """
    try:
        # Ask yt-dlp for English subtitles (manual or automatic),
        # without downloading the video itself
        ydl_opts = {
            'skip_download': True,
            'writeautomaticsub': True,
            'writesubtitles': True,
            'subtitleslangs': ['en'],
            'subtitlesformat': 'vtt',
            'verbose': False,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=False)
            video_id = info['id']
            debug(f"Video ID: {video_id}")

            # Check what's available in the info dict
            debug(f"Available keys: {[k for k in info.keys() if 'sub' in k.lower() or 'capt' in k.lower()]}")

            # Try automatic captions first ('or {}' guards against a None value)
            automatic_captions = info.get('automatic_captions') or {}
            debug(f"Automatic captions languages: {list(automatic_captions.keys())}")

            if 'en' in automatic_captions:
                debug("English automatic captions found")
                en_captions = automatic_captions['en']
                debug(f"Available English caption formats: {len(en_captions)}")

                # Prefer a VTT format; if none is found, use the first available
                vtt_caption = None
                for caption in en_captions:
                    debug(f"Caption format: {caption}")
                    if caption.get('ext') == 'vtt':
                        vtt_caption = caption
                        break
                if not vtt_caption:
                    vtt_caption = en_captions[0]

                caption_url = vtt_caption['url']
                debug(f"Using caption URL: {caption_url}")

                # Download the subtitle content
                response = requests.get(caption_url, timeout=30)
                response.raise_for_status()

                # If the response is an M3U8 playlist, parse it and fetch the
                # individual segments; otherwise it is a regular VTT file
                if response.text.strip().startswith('#EXTM3U'):
                    debug("Detected M3U8 playlist, parsing segments...")
                    return parse_m3u8_playlist(response.text, caption_url)
                else:
                    return parse_vtt(response.text)
            else:
                debug("No English automatic captions found")
                # Fall back to manually uploaded subtitles, if any matched
                requested_subtitles = info.get('requested_subtitles') or {}
                debug(f"Requested subtitles: {requested_subtitles}")
                if 'en' in requested_subtitles:
                    sub_url = requested_subtitles['en']['url']
                    debug(f"Using requested subtitle URL: {sub_url}")
                    response = requests.get(sub_url, timeout=30)
                    response.raise_for_status()
                    return parse_vtt(response.text)
                else:
                    debug("No requested subtitles found either")
                    return None
    except Exception as e:
        print(f"Error downloading transcript: {e}")
        import traceback
        traceback.print_exc()
        return None
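

# For debugging, roughly the same captions can be fetched from the shell with
# the yt-dlp CLI (flags assumed to mirror the options used above):
#   yt-dlp --skip-download --write-auto-subs --sub-langs en --sub-format vtt <url>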
def parse_m3u8_playlist(m3u8_content: str, base_url: str) -> str:
    """
    Parse an M3U8 playlist and download all of its segments
    to reconstruct the transcript.
    """
    lines = m3u8_content.strip().split('\n')

    # Extract the segment URLs: each #EXTINF line is followed by
    # the URL of the segment it describes
    segment_urls = []
    for i, line in enumerate(lines):
        if line.startswith('#EXTINF:'):
            if i + 1 < len(lines):
                segment_url = lines[i + 1]
                if not segment_url.startswith('#'):
                    segment_urls.append(segment_url)

    # Download each segment and concatenate the parsed text
    all_text = []
    for segment_url in segment_urls:
        try:
            # Handle relative URLs
            if not segment_url.startswith('http'):
                parsed_base = urllib.parse.urlparse(base_url)
                segment_url = urllib.parse.urljoin(f"{parsed_base.scheme}://{parsed_base.netloc}", segment_url)
            debug(f"Downloading segment: {segment_url[:100]}...")
            segment_response = requests.get(segment_url, timeout=30)
            segment_response.raise_for_status()
            # Parse this segment's VTT content
            all_text.append(parse_vtt(segment_response.text))
        except Exception as e:
            print(f"Error downloading segment {segment_url}: {e}")
            continue

    # Combine all text and deduplicate
    combined_text = '\n'.join(all_text)
    return deduplicate_lines(combined_text)
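

# For reference, a subtitle M3U8 playlist looks roughly like this
# (illustrative shape only; real segment URLs are signed YouTube links):
#   #EXTM3U
#   #EXTINF:10.0,
#   https://example.com/segment-0.vtt
#   #EXTINF:10.0,
#   https://example.com/segment-1.vtt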
def deduplicate_lines(text: str) -> str:
    """
    Remove duplicate consecutive lines.
    """
    lines = text.split('\n')
    final_lines = []
    prev_line = ""
    for line in lines:
        if line.strip() and line != prev_line:
            final_lines.append(line.strip())
        prev_line = line
    return '\n'.join(final_lines)
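

# A quick sanity check of the behavior (illustrative):
#   deduplicate_lines("hello\nhello\nworld") -> "hello\nworld"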
def parse_vtt(vtt_content: str) -> str:
    """
    Parse the VTT subtitle format to plain text, handling YouTube's
    specific format.
    """
    lines = vtt_content.strip().split('\n')
    text_lines = []

    # YouTube's VTT format has special syntax like <00:00:00.440><c> I </c>.
    # We need to extract the text content between the tags.
    for line in lines:
        # Skip VTT metadata lines
        if line.startswith('WEBVTT') or line.startswith('Kind:') or line.startswith('Language:'):
            continue
        # Skip empty lines
        if not line.strip():
            continue
        # Skip timing lines (lines with '-->' in them)
        if ' --> ' in line:
            continue
        if '<' in line and '>' in line:
            # Remove all HTML-like tags but preserve the text content.
            # This handles YouTube's format like <00:00:00.440><c> I </c>
            cleaned_line = re.sub(r'<[^>]+>', '', line)
            # Collapse extra whitespace
            cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
            if cleaned_line:
                text_lines.append(cleaned_line)
        else:
            # Regular text line
            text_lines.append(line.strip())

    # Join lines, removing duplicate consecutive lines
    final_lines = []
    prev_line = ""
    for line in text_lines:
        if line != prev_line:
            final_lines.append(line)
            prev_line = line
    return '\n'.join(final_lines)
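

# Example of what the parser does (illustrative cue, not real output):
#   "00:00:01.000 --> 00:00:02.000\n<00:00:01.440><c> hello</c> world"
#   becomes "hello world"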
def chat_with_llm(messages: List[Dict[str, str]], llm_url: str = "http://127.0.0.1:8080", stream: bool = True) -> str:
    """
    Send a chat completion request to the local LLM, print the reply to
    stdout as it streams in, and return the full reply. The response
    parsing assumes a streaming, SSE-style reply.
    """
    payload = {
        "messages": messages,
        "temperature": 0.6,
        "max_tokens": 2000,
        "stream": stream
    }
    try:
        response = requests.post(f"{llm_url}/chat/completions", json=payload, stream=True)
        response.raise_for_status()
        full_reply = ""
        # Parse the server-sent events stream: each event is a line of the
        # form "data: {json}", terminated by "data: [DONE]"
        for line in response.iter_lines():
            if line:
                decoded_line = line.decode("utf-8")
                if decoded_line.startswith("data:"):
                    data_str = decoded_line[5:].strip()
                    if data_str == "[DONE]":
                        break
                    try:
                        data = json.loads(data_str)
                        content = data['choices'][0]['delta'].get('content', '')
                        if content:
                            print(content, end='', flush=True)
                            full_reply += content
                    except Exception:
                        continue
        print("")
        return full_reply
    except Exception as e:
        print(f"Error contacting LLM: {e}")
        return ""
def summarize_with_llm(text: str, llm_url: str = "http://127.0.0.1:8080") -> Optional[str]:
    messages = [
        {"role": "system", "content": "You are a helpful assistant that summarizes YouTube video transcripts."},
        {"role": "user", "content": f"Summarize the following YouTube video transcript:\n\n{text}"}
    ]
    return chat_with_llm(messages, llm_url)
def interactive_console(initial_summary: str, llm_url: str = "http://127.0.0.1:8080"):
    print("\nEntering interactive chat mode. Type 'exit' to quit.")
    # Seed the conversation with the summary, so follow-up questions have context
    conversation = [
        {"role": "system", "content": "You are a helpful assistant that answers questions about the YouTube video."},
        {"role": "assistant", "content": initial_summary}
    ]
    while True:
        user_input = input("\n> ")
        if user_input.lower() in ["exit", "quit"]:
            print("Goodbye!")
            break
        conversation.append({"role": "user", "content": user_input})
        reply = chat_with_llm(conversation, llm_url)
        conversation.append({"role": "assistant", "content": reply})
def main():
    start = time.time()
    if len(sys.argv) != 2:
        print("Usage: python youtube_summarizer.py <youtube_url>")
        sys.exit(1)

    video_url = sys.argv[1]
    print("Downloading transcript...")
    transcript = download_transcript(video_url)
    if not transcript:
        print("Failed to download transcript")
        sys.exit(1)

    print("Transcript downloaded successfully")
    print(f"Transcript length: {len(transcript)} characters")

    # Show the first 200 characters to verify the content
    print("\nFirst 200 characters of transcript:")
    print((transcript[:200] + "...") if len(transcript) > 200 else transcript)

    # Linear estimate (minutes ≈ slope · characters + intercept), tuned to
    # llama.cpp running Qwen3-30B-A3B-Thinking-2507-UD-Q4_K_XL.gguf
    # from Unsloth, on my Intel(R) Core(TM) i9-10900 CPU @ 2.80GHz
    # with 64GB of RAM
    estimated_minutes = len(transcript) * .0003032768268323 + .054881491
    start_time = datetime.now()
    completion_time = start_time + timedelta(minutes=estimated_minutes)
    print(f"ETA of completion: {completion_time.strftime('%Y-%m-%d %H:%M:%S')}")

    print("\nSummarizing with LLM...")
    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    summary = summarize_with_llm(transcript)
    print("=" * 50)
    if not summary:
        print("Failed to get summary from LLM")
        sys.exit(1)

    end = time.time()
    print(f"Took {(end - start) / 60:.2f} minutes.")
    interactive_console(summary)


if __name__ == "__main__":
    main()