Created
March 2, 2025 10:52
-
-
Save ruphy/09e7976fb45b6374d7f6ccb64ecfd0b7 to your computer and use it in GitHub Desktop.
VTT to TXT parser (serverless)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import json | |
def convert_vtt_to_txt(raw_text: str) -> str:
    """Convert WebVTT voice spans into a speaker-labelled transcript.

    Every ``<v Speaker>text`` span in *raw_text* becomes a piece of dialogue.
    Whitespace runs inside the text are collapsed to single spaces, and
    consecutive pieces from the same speaker are merged into one paragraph.

    Args:
        raw_text: The WebVTT document (or any text containing ``<v ...>``
            voice tags).

    Returns:
        Paragraphs of the form ``"Speaker: text"`` separated by blank lines,
        or an empty string when no non-empty voice span is found.
    """
    # The opening tag may carry class annotations, e.g. ``<v.loud Speaker>``.
    voice_open = r"<v(?:\.[^\s>]+)?[ \t]+([^>]+)>"
    # Cue text that never runs into the next opening voice tag.
    cue_text = r"(?:(?!<v[ \t.]).)*?"
    # Prefer an explicitly closed span.  WebVTT lets the ``</v>`` end tag be
    # omitted, so otherwise stop at the blank line that ends the cue, at the
    # next voice tag, or at the end of the input, instead of swallowing the
    # following cues up to somebody else's ``</v>``.
    pattern = re.compile(
        rf"{voice_open}"
        rf"(?:({cue_text})</v>"
        rf"|({cue_text})(?=\r?\n[ \t]*\r?\n|<v[ \t.]|\Z))",
        re.DOTALL,
    )

    # Each turn is (speaker, [dialogue pieces]); consecutive cues by the same
    # speaker are appended to the most recent turn.
    turns = []
    for match in pattern.finditer(raw_text):
        speaker = match.group(1).strip()
        closed_text, open_text = match.group(2), match.group(3)
        cue = closed_text if closed_text is not None else open_text
        dialogue = re.sub(r"\s+", " ", cue.strip())
        if not dialogue:
            # An empty cue carries no speech; do not emit a bare "Speaker: ".
            continue
        if turns and turns[-1][0] == speaker:
            turns[-1][1].append(dialogue)
        else:
            turns.append((speaker, [dialogue]))

    return "\n\n".join(
        f"{speaker}: {' '.join(pieces)}" for speaker, pieces in turns
    )
def _json_response(status_code, payload):
    """Build the response dict with *payload* serialized as the JSON body."""
    return {
        "statusCode": status_code,
        "headers": {"Content-Type": "application/json"},
        "body": json.dumps(payload),
    }


def _extract_text(event):
    """Return the non-empty ``text`` string from the event's JSON body.

    Raises:
        ValueError: If the body is missing, is not valid JSON, is not a JSON
            object, or does not contain a non-empty string under ``"text"``.
    """
    try:
        raw_body = event["body"]
    except (KeyError, TypeError, IndexError):
        raw_body = None
    if raw_body is None:
        raise ValueError("Request body is missing.")
    if not isinstance(raw_body, (str, bytes, bytearray)):
        raise ValueError("Request body must be a JSON-encoded string.")

    try:
        body = json.loads(raw_body)
    except (ValueError, RecursionError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        raise ValueError(f"Request body is not valid JSON: {exc}") from exc
    if not isinstance(body, dict):
        raise ValueError("Request body must be a JSON object.")

    text = body.get("text", "")
    if not text:
        raise ValueError("No text provided.")
    if not isinstance(text, str):
        raise ValueError('The "text" field must be a string.')
    return text


def handle(event, context):
    """Serverless entry point: convert the posted VTT text to plain text.

    Args:
        event: Request mapping whose ``"body"`` entry is a JSON string of the
            form ``{"text": "<vtt content>"}``.
        context: Invocation context supplied by the platform; unused.

    Returns:
        A response dict with ``statusCode``, ``headers`` and a JSON ``body``:
        ``{"result": ...}`` with status 200 on success, ``{"error": ...}``
        with status 400 when the request is malformed, and ``{"error": ...}``
        with status 500 if the conversion itself fails unexpectedly.
    """
    try:
        text = _extract_text(event)
        cleaned_text = convert_vtt_to_txt(text)
    except ValueError as exc:
        # The caller sent something unusable: report it as a client error.
        return _json_response(400, {"error": str(exc)})
    except Exception as exc:
        # Top-level boundary: never let the handler raise, but do not blame
        # the client for an internal failure.
        return _json_response(500, {"error": str(exc)})
    return _json_response(200, {"result": cleaned_text})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment