Skip to content

Instantly share code, notes, and snippets.

@roelven
Created June 25, 2025 15:08
Show Gist options
  • Save roelven/3a3e914d7a25ced2343c5de1af2d1d8b to your computer and use it in GitHub Desktop.
Save roelven/3a3e914d7a25ced2343c5de1af2d1d8b to your computer and use it in GitHub Desktop.
Python script to convert YouTube subtitles to txt files
#!/usr/bin/env python3
import json
import re
#
# Convert a YouTube subtitle response to a txt file.
#
# Look for a request to `https://www.youtube.com/api/timedtext` in your browser's
# network tab on a YouTube video page. Copy the response into a `subtitle.json`
# file and save this script in the same folder.
#
# Run `$ python3 convert_subtitle.py` and a `transcript.txt` file should be
# generated with your subtitles!
#
def parse_subtitles_to_transcript(json_file, output_file):
    """Convert a subtitle JSON file into a plain-text transcript file.

    The JSON document is expected to hold an ``events`` list whose items may
    carry a ``segs`` list of ``{"utf8": "..."}`` fragments. Caption text is
    collected, chunked into sentence-like lines and written to *output_file*
    with a blank line between chunks.

    Args:
        json_file: Path of the subtitle JSON file to read (UTF-8).
        output_file: Path of the transcript text file to write (UTF-8).

    Returns:
        None. Progress and errors are reported with ``print``; any exception
        raised while reading, parsing or writing is caught and printed rather
        than propagated.
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        transcript = ' '.join(_collect_caption_texts(data))
        sentences = _split_into_sentences(transcript)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(_format_sentences(sentences))

        print(f"Successfully converted subtitles to transcript: {output_file}")
        print(f"Total sentences: {len(sentences)}")
    except Exception as e:
        # Deliberately best-effort: this is a one-shot script, so failures are
        # reported to the user instead of surfacing as a traceback.
        print(f"Error converting subtitles: {e}")


def _collect_caption_texts(data):
    """Return the stripped, non-empty caption text of each event in *data*."""
    caption_texts = []
    for event in data.get('events', []):
        # Events without segments carry no caption text.
        if 'segs' not in event:
            continue
        pieces = []
        for seg in event['segs']:
            if 'utf8' not in seg:
                continue
            text = seg['utf8']
            # Skip standalone newlines and bracketed annotations ("[Music]").
            if text == '\n':
                continue
            if text.startswith('[') and text.endswith(']'):
                continue
            pieces.append(text)
        event_text = ''.join(pieces).strip()
        if event_text:
            caption_texts.append(event_text)
    return caption_texts


def _split_into_sentences(transcript, max_words=15, min_words=8):
    """Chunk *transcript* into sentence-like strings with a simple heuristic.

    A chunk is closed once it holds more than *max_words* words, or once it
    holds more than *min_words* words and the next word starts with an
    uppercase letter.
    """
    # str.split() with no separator already collapses any run of whitespace,
    # so no separate whitespace-normalisation pass is needed.
    words = transcript.split()
    last_index = len(words) - 1
    sentences = []
    current = []
    for i, word in enumerate(words):
        current.append(word)
        next_is_capitalized = i < last_index and words[i + 1][0].isupper()
        if len(current) > max_words or (
            len(current) > min_words and next_is_capitalized
        ):
            sentences.append(' '.join(current))
            current = []
    # Keep whatever is left over as the final chunk.
    if current:
        sentences.append(' '.join(current))
    return sentences


def _format_sentences(sentences):
    """Join *sentences* with blank lines, ensuring each ends in punctuation."""
    # A chunk that already ends with '.', '!' or '?' must not get an extra
    # period appended (which would produce "?." or "!.").
    terminators = ('.', '!', '?')
    return '\n\n'.join(
        sentence if sentence.endswith(terminators) else sentence + '.'
        for sentence in sentences
    )
if __name__ == "__main__":
    # Script entry point: read the subtitle JSON saved next to this script and
    # write the transcript alongside it, using fixed file names.
    source_json = 'subtitle.json'
    target_txt = 'transcript.txt'
    parse_subtitles_to_transcript(source_json, target_txt)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment