Created
June 25, 2025 15:08
-
-
Save roelven/3a3e914d7a25ced2343c5de1af2d1d8b to your computer and use it in GitHub Desktop.
Python script to convert YouTube subtitles to txt files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import json | |
import re | |
#
# Convert a YouTube subtitle track to a txt file
#
# Look for `https://www.youtube.com/api/timedtext` in your browser's network tab
# while on a YouTube video. Copy the response into a subtitle.json file and save
# this script in the same folder.
#
# Run `$ python3 convert_subtitle.py` and a transcript.txt file should be
# generated with your subtitles!
#
# Punctuation that already terminates a sentence; no period is appended after it.
_SENTENCE_TERMINATORS = ('.', '!', '?')

# A chunk is closed unconditionally once it grows beyond this many words.
_MAX_WORDS_PER_SENTENCE = 15

# A chunk longer than this is closed early when the next word is capitalized.
_MIN_WORDS_BEFORE_CAPITAL_BREAK = 8


def _extract_event_texts(data):
    """Return the stripped, non-empty caption text of every event in *data*."""
    parts = []
    for event in data.get('events', []):
        # Events without segments carry no caption text.
        if 'segs' not in event:
            continue
        pieces = []
        for seg in event['segs']:
            if 'utf8' not in seg:
                continue
            text = seg['utf8']
            # Skip standalone newlines and bracketed annotations such as "[Music]".
            if text == '\n' or (text.startswith('[') and text.endswith(']')):
                continue
            pieces.append(text)
        event_text = ''.join(pieces).strip()
        if event_text:
            parts.append(event_text)
    return parts


def _split_into_sentences(words):
    """Group *words* into sentence-like chunks using a simple length heuristic."""
    sentences = []
    current = []
    last_index = len(words) - 1
    for i, word in enumerate(words):
        current.append(word)
        # Words come from str.split(), so they are never empty and [0] is safe.
        next_is_capitalized = i < last_index and words[i + 1][0].isupper()
        if (len(current) > _MAX_WORDS_PER_SENTENCE or
                (len(current) > _MIN_WORDS_BEFORE_CAPITAL_BREAK and
                 next_is_capitalized)):
            sentences.append(' '.join(current))
            current = []
    # Keep whatever is left over as the final chunk.
    if current:
        sentences.append(' '.join(current))
    return sentences


def _format_sentences(sentences):
    """Join *sentences* as paragraphs, adding a period where none terminates one."""
    return '\n\n'.join(
        sentence if sentence.endswith(_SENTENCE_TERMINATORS) else sentence + '.'
        for sentence in sentences
    )


def parse_subtitles_to_transcript(json_file, output_file):
    """
    Parse a subtitle JSON file and write a clean transcript.

    The JSON document is read for an ``events`` list; the ``utf8`` text of
    each event's ``segs`` entries is collected, skipping bare newlines and
    bracketed annotations. The text is then split into sentence-like chunks
    and written to *output_file*, one chunk per paragraph.

    Args:
        json_file: Path of the subtitle JSON file to read (UTF-8).
        output_file: Path of the text file to write (UTF-8, overwritten).

    Returns:
        None. Progress and errors are reported on standard output; any
        failure is caught and printed rather than raised.
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # split() already collapses every run of whitespace, so no separate
        # normalization pass is needed before tokenizing.
        words = ' '.join(_extract_event_texts(data)).split()
        sentences = _split_into_sentences(words)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(_format_sentences(sentences))

        print(f"Successfully converted subtitles to transcript: {output_file}")
        print(f"Total sentences: {len(sentences)}")
    except Exception as e:
        # Deliberately broad: this is the script's top-level boundary and the
        # conversion is best-effort, so every failure is reported, not raised.
        print(f"Error converting subtitles: {e}")
if __name__ == "__main__":
    # Script entry point: read subtitle.json from the current directory and
    # write the transcript next to it.
    parse_subtitles_to_transcript(
        json_file='subtitle.json',
        output_file='transcript.txt',
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment