Skip to content

Instantly share code, notes, and snippets.

@ruphy
Created March 2, 2025 10:52
Show Gist options
  • Save ruphy/09e7976fb45b6374d7f6ccb64ecfd0b7 to your computer and use it in GitHub Desktop.
Save ruphy/09e7976fb45b6374d7f6ccb64ecfd0b7 to your computer and use it in GitHub Desktop.
VTT to TXT parser (serverless)
import re
import json
# Matches one WebVTT voice span and captures (speaker, dialogue).
_VOICE_SPAN_RE = re.compile(
    # Opening tag: "<v", an optional ".class" list, whitespace, speaker name.
    r"<v(?:\.[^\s>]+)?[ \t]+([^>]+)>"
    # Dialogue text, as short as possible.
    r"(.*?)"
    # The span ends at its closing tag or, when the closing tag was omitted,
    # at the next voice tag, at a blank line (cue boundary) or at end of input.
    r"(?:</v>|(?=<v[ \t.])|(?=\r?\n[ \t]*\r?\n)|\Z)",
    re.DOTALL,
)


def convert_vtt_to_txt(raw_text: str) -> str:
    """Convert WebVTT text with voice spans into a plain-text transcript.

    Every ``<v Speaker>dialogue</v>`` span is extracted, whitespace runs in
    the dialogue are collapsed to single spaces, and consecutive spans by the
    same speaker are merged into one paragraph.

    A span whose ``</v>`` is missing is terminated at the next voice tag, the
    next blank line, or the end of the input, so it cannot absorb the
    timestamps and dialogue of later cues. Spans with no text are skipped.

    Args:
        raw_text: The contents of a WebVTT document.

    Returns:
        One ``"Speaker: dialogue"`` paragraph per run of consecutive spans by
        the same speaker, separated by blank lines. An empty string is
        returned when no voice span with text is found.
    """
    paragraphs = []  # list of (speaker, [dialogue, ...]) in document order
    for raw_speaker, raw_dialogue in _VOICE_SPAN_RE.findall(raw_text):
        speaker = raw_speaker.strip()
        dialogue = re.sub(r"\s+", " ", raw_dialogue.strip())
        if not dialogue:
            # An empty span would only add "Speaker: " or a double space.
            continue
        if paragraphs and paragraphs[-1][0] == speaker:
            paragraphs[-1][1].append(dialogue)
        else:
            paragraphs.append((speaker, [dialogue]))

    return "\n\n".join(
        f"{speaker}: {' '.join(parts)}" for speaker, parts in paragraphs
    )
def handle(event, context):
    """Serverless entry point: convert the VTT text in a JSON request body.

    The request body is expected to be a JSON object with a ``text`` field
    holding the WebVTT content.

    Args:
        event: Mapping whose ``"body"`` entry is the JSON-encoded request.
        context: Runtime context supplied by the platform; unused.

    Returns:
        A response dict with ``statusCode``, ``headers`` and a JSON ``body``.
        On success the status is 200 and the body is ``{"result": <text>}``.
        On any failure the status is 400 and the body is
        ``{"error": <message>}``.
    """
    headers = {"Content-Type": "application/json"}
    try:
        body = json.loads(event["body"])
        # Validate the payload shape explicitly so that clients get a clear
        # message instead of an internal AttributeError/TypeError text.
        if not isinstance(body, dict):
            raise ValueError("Request body must be a JSON object.")
        text = body.get("text", "")
        if not text:
            raise ValueError("No text provided.")
        if not isinstance(text, str):
            raise ValueError("The text field must be a string.")
        cleaned_text = convert_vtt_to_txt(text)
    except Exception as e:
        # Handler boundary: every failure is reported to the caller as a 400.
        return {
            "statusCode": 400,
            "headers": headers,
            "body": json.dumps({"error": str(e)}),
        }
    return {
        "statusCode": 200,
        "headers": headers,
        "body": json.dumps({"result": cleaned_text}),
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment