roelven · June 25, 2025 15:08
diff --git a/convert_subtitle.py b/convert_subtitle.py
 #!/usr/bin/env python3
 import json
 import re

 #
 # Convert a YouTube subtitle response (JSON) to a plain-text file
 #
 # Look for `https://www.youtube.com/api/timedtext` in your browser's network tab
 # on a YouTube video page. Copy the response into a `subtitle.json` file and save
 # this script in the same folder.
 #
 # Run `$ python3 convert_subtitle.py` and a `transcript.txt` file should be
 # generated with your subtitles!
 # 

 def _collect_event_texts(data):
    """Return the cleaned caption text of every event in *data*, in order.

    Segments that are a lone newline or a bracketed annotation such as
    ``[Music]`` are dropped; events left with no text are omitted.
    """
    texts = []
    for event in data.get('events', []):
        pieces = []
        # Events without a 'segs' list carry no caption text.
        for seg in event.get('segs') or ():
            text = seg.get('utf8')
            if text is None or text == '\n':
                continue
            # Tolerate surrounding whitespace when spotting annotations,
            # so " [Music]" is filtered just like "[Music]".
            marker = text.strip()
            if marker.startswith('[') and marker.endswith(']'):
                continue
            pieces.append(text)
        joined = ''.join(pieces).strip()
        if joined:
            texts.append(joined)
    return texts


 def _chunk_words(words, max_words=15, min_words=8):
    """Group *words* into sentence-like chunks.

    A chunk is closed once it holds more than *max_words* words, or more
    than *min_words* words when the following word starts with an
    uppercase letter. This is a heuristic, not real sentence detection.
    """
    chunks = []
    current = []
    last_index = len(words) - 1
    for i, word in enumerate(words):
        current.append(word)
        next_is_capitalized = i < last_index and words[i + 1][0].isupper()
        if len(current) > max_words or (
                len(current) > min_words and next_is_capitalized):
            chunks.append(' '.join(current))
            current = []
    # Whatever is left over forms the final chunk.
    if current:
        chunks.append(' '.join(current))
    return chunks


 def parse_subtitles_to_transcript(json_file, output_file):
    """
    Parse subtitle JSON file and create a clean transcript.

    Args:
        json_file: Path to the JSON file; caption text is read from
            ``events[*].segs[*].utf8``.
        output_file: Path of the UTF-8 text file to write. Chunks are
            separated by a blank line.

    Returns:
        None. Success and failure are both reported on stdout; errors
        derived from ``Exception`` are printed rather than raised.
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # str.split() with no argument already collapses every run of
        # whitespace, so no separate normalisation pass is needed.
        words = ' '.join(_collect_event_texts(data)).split()
        sentences = _chunk_words(words)

        # Close each chunk with a period unless it already ends with
        # terminal punctuation (avoids output such as "Really?.").
        paragraphs = [
            sentence if sentence.endswith(('.', '!', '?')) else sentence + '.'
            for sentence in sentences
        ]

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(paragraphs))

        print(f"Successfully converted subtitles to transcript: {output_file}")
        print(f"Total sentences: {len(sentences)}")

    except Exception as e:
        # Script-level boundary: report the problem instead of raising,
        # which is the behaviour existing callers rely on.
        print(f"Error converting subtitles: {e}")

 if __name__ == "__main__":
    # Script entry point: read subtitle.json from the current directory
    # and write the transcript next to it.
    parse_subtitles_to_transcript(
        json_file='subtitle.json',
        output_file='transcript.txt',
    )
	#!/usr/bin/env python3
	import json
	import re

	#
	# Convert a YouTube subtitle response (JSON) to a plain-text file
	#
	# Look for `https://www.youtube.com/api/timedtext` in your browser's network tab
	# on a YouTube video page. Copy the response into a `subtitle.json` file and save
	# this script in the same folder.
	#
	# Run `$ python3 convert_subtitle.py` and a `transcript.txt` file should be
	# generated with your subtitles!
	#

	def parse_subtitles_to_transcript(json_file, output_file):
	    """
	    Parse subtitle JSON file and create a clean transcript.

	    Args:
	        json_file: Path to the JSON file; caption text is read from
	            ``events[*].segs[*].utf8``.
	        output_file: Path of the UTF-8 text file to write. Chunks are
	            separated by a blank line.

	    Returns:
	        None. Success and failure are both reported on stdout; errors
	        derived from ``Exception`` are printed rather than raised.
	    """
	    try:
	        with open(json_file, 'r', encoding='utf-8') as f:
	            data = json.load(f)

	        # Extract the text of every event, one entry per event
	        transcript_parts = []
	        for event in data.get('events', []):
	            # Skip events without segments
	            if 'segs' not in event:
	                continue

	            kept = []
	            for seg in event['segs']:
	                text = seg.get('utf8')
	                # Skip missing text and standalone newlines
	                if text is None or text == '\n':
	                    continue
	                # Skip annotations such as "[Music]", tolerating
	                # whitespace around the brackets
	                bare = text.strip()
	                if bare.startswith('[') and bare.endswith(']'):
	                    continue
	                kept.append(text)

	            segment_text = ''.join(kept).strip()
	            if segment_text:
	                transcript_parts.append(segment_text)

	        # Join all parts and collapse runs of whitespace
	        transcript = re.sub(r'\s+', ' ', ' '.join(transcript_parts))

	        # Heuristic chunking: close a chunk after more than 15 words, or
	        # after more than 8 words when the next word is capitalised
	        sentences = []
	        current_words = []
	        words = transcript.split()
	        for i, word in enumerate(words):
	            current_words.append(word)
	            count = len(current_words)
	            next_capitalized = (
	                i < len(words) - 1 and words[i + 1][0].isupper()
	            )
	            if count > 15 or (count > 8 and next_capitalized):
	                sentences.append(' '.join(current_words))
	                current_words = []

	        # Add the last chunk if any
	        if current_words:
	            sentences.append(' '.join(current_words))

	        # Close each chunk with a period unless it already ends with
	        # terminal punctuation (avoids output such as "Really?.")
	        paragraphs = [
	            sentence if sentence.endswith(('.', '!', '?')) else sentence + '.'
	            for sentence in sentences
	        ]

	        # Write to output file, one chunk per paragraph
	        with open(output_file, 'w', encoding='utf-8') as f:
	            f.write('\n\n'.join(paragraphs))

	        print(f"Successfully converted subtitles to transcript: {output_file}")
	        print(f"Total sentences: {len(sentences)}")

	    except Exception as e:
	        # Script-level boundary: report the problem instead of raising
	        print(f"Error converting subtitles: {e}")

	if __name__ == "__main__":
	    # Script entry point: read subtitle.json from the current directory
	    # and write the transcript next to it.
	    parse_subtitles_to_transcript('subtitle.json', 'transcript.txt')