Skip to content

Instantly share code, notes, and snippets.

@i80and
Created September 14, 2025 20:16
Show Gist options
  • Select an option

  • Save i80and/ff6b0790e77b7a1ee7e04257b9199118 to your computer and use it in GitHub Desktop.

Select an option

Save i80and/ff6b0790e77b7a1ee7e04257b9199118 to your computer and use it in GitHub Desktop.
import sys
from pathlib import Path
from pydantic import BaseModel
import ollama
# Model name passed as `model=` to ollama.chat() for every translation request.
MODEL = 'qwen3:30b-a3b-thinking-2507-q4_K_M'
# Content of the system-role message sent ahead of each user prompt.
SYSTEM_PROMPT = 'You are a translation engine which will translate subtitles accurately while maintaining context and natural flow.'
# Shape of the structured reply requested from the model: its JSON schema is
# passed as `format=` to ollama.chat() and the reply is validated against it.
# NOTE: kept free of a class docstring on purpose -- pydantic would copy it
# into the generated JSON schema and thereby change the request.
class TranslationResult(BaseModel):
    # The translated subtitle text; the only field the caller reads.
    translation: str
    # Requested from the model but not read anywhere in this file
    # (presumably a self-reported confidence flag -- TODO confirm intent).
    confident: bool
def create_request_template(text: str, from_lang: str, to_lang: str, previous_corpus: str, next_corpus: str | None) -> str:
parts = [
f"I have the following text in {from_lang} which I would like to translate to {to_lang}:\n\n",
f'"{text}"\n\n'
]
if previous_corpus:
parts.extend([
"For context, here are the previous 3 subtitles (already translated):\n",
previous_corpus,
"\n\n"
])
if next_corpus:
parts.extend([
"For additional context, here is the next subtitle (untranslated):\n",
f'"{next_corpus}"\n\n'
])
parts.append("Please provide an accurate translation that maintains natural flow and context.")
return ''.join(parts)
def translate(text: str, from_lang: str, to_lang: str, previous_corpus: str = "", next_corpus: str | None = None) -> str:
    """Translate a single subtitle through the Ollama chat API.

    Builds the prompt with ``create_request_template``, sends it together with
    ``SYSTEM_PROMPT`` to ``MODEL``, and validates the JSON reply against
    ``TranslationResult``.

    Args:
        text: The subtitle text to translate.
        from_lang: Source language name.
        to_lang: Target language name.
        previous_corpus: Previously translated subtitles used as context.
        next_corpus: The next, untranslated subtitle used as context.

    Returns:
        The ``translation`` field of the validated reply; the ``confident``
        field is parsed but not used.
    """
    user_prompt = create_request_template(text, from_lang, to_lang, previous_corpus, next_corpus)
    conversation = [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': user_prompt},
    ]

    reply = ollama.chat(
        model=MODEL,
        messages=conversation,
        # Constrain the reply to the JSON schema generated by the pydantic model.
        format=TranslationResult.model_json_schema(),
        # Temperature 0 to make responses more deterministic.
        options={'temperature': 0},
    )

    parsed = TranslationResult.model_validate_json(reply.message.content)
    return parsed.translation
def parse_subtitles(text: str) -> list[tuple[int, str, str]]:
    """Parse the contents of an SRT subtitle file.

    Line endings are normalized (CRLF and lone CR become LF), a leading
    byte-order mark is dropped, and extra blank lines between subtitle blocks
    are tolerated.

    Args:
        text: The full text of the SRT file.

    Returns:
        A list of ``(subtitle_number, timestamp, subtitle_text)`` tuples in
        file order. Multi-line subtitle text is joined with ``"\\n"``.

    Raises:
        ValueError: If the input is empty, or a block has fewer than three
            lines, a non-integer sequence number, a timestamp line without
            ``" --> "``, or empty text.
    """
    # Without this, a CRLF file never matches the '\n\n' block separator and
    # collapses into a single bogus subtitle.
    normalized = text.replace('\r\n', '\n').replace('\r', '\n')
    # A BOM is not whitespace, so strip() keeps it and int() would reject the
    # first sequence number.
    normalized = normalized.removeprefix('\ufeff').strip()

    if not normalized:
        raise ValueError("Empty input text")

    subtitles = []
    for block in normalized.split('\n\n'):
        block = block.strip()
        if not block:
            # Three or more consecutive blank lines yield empty chunks here;
            # they are just separators, not malformed subtitles.
            continue

        lines = block.split('\n')

        # Each subtitle block must have at least 3 lines: number, timestamp, text
        if len(lines) < 3:
            raise ValueError(f"Invalid subtitle block: {block}")

        try:
            subtitle_num = int(lines[0].strip())
        except ValueError as err:
            raise ValueError(f"Invalid subtitle number: {lines[0]}") from err

        timestamp = lines[1].strip()
        # Basic sanity check only: the timestamp line must contain the arrow.
        if ' --> ' not in timestamp:
            raise ValueError(f"Invalid timestamp format: {timestamp}")

        # Everything after the timestamp line is the (possibly multi-line) text.
        subtitle_text = '\n'.join(lines[2:]).strip()
        if not subtitle_text:
            raise ValueError(f"Empty subtitle text for subtitle {subtitle_num}")

        subtitles.append((subtitle_num, timestamp, subtitle_text))

    return subtitles
def main() -> None:
    """Translate the SRT file named on the command line from Dutch to English.

    Reads the file given as the first command-line argument, translates each
    subtitle in order, and prints the result to standard output in SRT format.
    Each request gives the model up to three previously translated subtitles
    and the next untranslated subtitle as context.
    """
    if len(sys.argv) < 2:
        # Exit with a usage message on stderr instead of an IndexError traceback.
        sys.exit(f"usage: {sys.argv[0]} INPUT.srt")

    # Decode explicitly rather than with the locale default; "utf-8-sig" also
    # drops a leading byte-order mark if the file has one.
    input_srt = Path(sys.argv[1]).read_text(encoding="utf-8-sig")
    subtitles = parse_subtitles(input_srt)
    translated_subtitles: list[tuple[int, str, str]] = []

    for i, (sequence_number, timestamp, subtitle_text) in enumerate(subtitles):
        # Context: the last three translated subtitles, formatted as SRT blocks.
        # Slicing an empty list gives an empty string, i.e. no context.
        previous_corpus = "\n\n".join(
            f"{prev_num}\n{prev_time}\n{prev_text}"
            for prev_num, prev_time, prev_text in translated_subtitles[-3:]
        )

        # Context: the text of the next subtitle, if there is one.
        next_corpus = subtitles[i + 1][2] if i + 1 < len(subtitles) else None

        translation = translate(
            subtitle_text,
            "Dutch",
            "English",
            previous_corpus,
            next_corpus,
        )
        translated_subtitles.append((sequence_number, timestamp, translation))

        # Emit one SRT block (number, timestamp, text, blank line). Flush so
        # each block appears as soon as it is translated, even when stdout is
        # redirected, since every subtitle waits on a model call.
        print(f"{sequence_number}\n{timestamp}\n{translation}\n", flush=True)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment