Created
September 14, 2025 20:16
-
-
Save i80and/ff6b0790e77b7a1ee7e04257b9199118 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys | |
| from pathlib import Path | |
| from pydantic import BaseModel | |
| import ollama | |
# Ollama model tag used as the `model=` argument of the chat request.
MODEL = "qwen3:30b-a3b-thinking-2507-q4_K_M"

# System message sent as the first entry of the chat request.
SYSTEM_PROMPT = (
    "You are a translation engine which will translate subtitles "
    "accurately while maintaining context and natural flow."
)
# Schema of the structured reply requested from the model. Its JSON schema is
# handed to the chat call as the required output format, and the reply is
# validated against it.
# NOTE: documented with comments instead of a docstring on purpose -- pydantic
# copies a model's docstring into the generated JSON schema, which would
# change what is sent to the model.
class TranslationResult(BaseModel):
    # The translated subtitle text.
    translation: str
    # Presumably the model's own judgement of whether the translation is
    # reliable; it is not read anywhere in the visible code.
    confident: bool
| def create_request_template(text: str, from_lang: str, to_lang: str, previous_corpus: str, next_corpus: str | None) -> str: | |
| parts = [ | |
| f"I have the following text in {from_lang} which I would like to translate to {to_lang}:\n\n", | |
| f'"{text}"\n\n' | |
| ] | |
| if previous_corpus: | |
| parts.extend([ | |
| "For context, here are the previous 3 subtitles (already translated):\n", | |
| previous_corpus, | |
| "\n\n" | |
| ]) | |
| if next_corpus: | |
| parts.extend([ | |
| "For additional context, here is the next subtitle (untranslated):\n", | |
| f'"{next_corpus}"\n\n' | |
| ]) | |
| parts.append("Please provide an accurate translation that maintains natural flow and context.") | |
| return ''.join(parts) | |
def translate(text: str, from_lang: str, to_lang: str, previous_corpus: str = "", next_corpus: str | None = None) -> str:
    """Translate one subtitle by sending a chat request through ``ollama``.

    The user prompt is built by ``create_request_template``. The reply is
    requested in the JSON shape of ``TranslationResult`` and validated
    against it; only the ``translation`` field is returned, so the
    ``confident`` flag is discarded.

    Args:
        text: Subtitle text to translate.
        from_lang: Name of the source language.
        to_lang: Name of the target language.
        previous_corpus: Already-translated preceding subtitles used as
            context; empty string for none.
        next_corpus: Untranslated following subtitle used as context, or
            ``None``.

    Returns:
        The translated text.

    Nothing is caught here: errors from the chat call or from validating
    the reply propagate to the caller.
    """
    conversation = [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {
            'role': 'user',
            'content': create_request_template(text, from_lang, to_lang, previous_corpus, next_corpus),
        },
    ]
    reply = ollama.chat(
        model=MODEL,
        messages=conversation,
        # Constrain the reply to the JSON schema generated by pydantic.
        format=TranslationResult.model_json_schema(),
        # Temperature 0 makes responses more deterministic.
        options={'temperature': 0},
    )
    return TranslationResult.model_validate_json(reply.message.content).translation
def parse_subtitles(text: str) -> list[tuple[int, str, str]]:
    """
    Parse SRT subtitle file format.

    Blocks are separated by one or more blank lines. Each block consists of
    a numeric index line, a timestamp line containing ``' --> '``, and one
    or more lines of text. A leading byte-order mark and CRLF / CR line
    endings are tolerated.

    Args:
        text: Full contents of an SRT file.

    Returns:
        A list of tuples: (subtitle_number, timestamp, subtitle_text), in
        file order. Multi-line subtitle text keeps its internal newlines.

    Raises:
        ValueError: If the input is empty, a block has fewer than three
            lines, the index is not an integer, the timestamp lacks the
            arrow, or the subtitle text is empty.
    """
    # A BOM is not whitespace: str.strip() keeps it and int() would reject
    # the first subtitle number. Line endings are normalised so that the
    # blank-line separator is always exactly '\n\n'.
    normalized = (
        text.lstrip('\ufeff')
        .replace('\r\n', '\n')
        .replace('\r', '\n')
        .strip()
    )
    if not normalized:
        raise ValueError("Empty input text")

    subtitles = []
    for raw_block in normalized.split('\n\n'):
        block = raw_block.strip()
        if not block:
            # Four or more consecutive newlines leave empty fragments
            # behind; they are extra separators, not malformed subtitles.
            continue

        lines = block.split('\n')

        # Each subtitle block must have at least 3 lines: number, timestamp, text
        if len(lines) < 3:
            raise ValueError(f"Invalid subtitle block: {block}")

        # Parse subtitle number
        try:
            subtitle_num = int(lines[0].strip())
        except ValueError as err:
            raise ValueError(f"Invalid subtitle number: {lines[0]}") from err

        # Extract timestamp (second line)
        timestamp = lines[1].strip()

        # Validate timestamp format (basic check for arrow)
        if ' --> ' not in timestamp:
            raise ValueError(f"Invalid timestamp format: {timestamp}")

        # Extract subtitle text (remaining lines joined)
        subtitle_text = '\n'.join(lines[2:]).strip()
        if not subtitle_text:
            raise ValueError(f"Empty subtitle text for subtitle {subtitle_num}")

        subtitles.append((subtitle_num, timestamp, subtitle_text))

    return subtitles
def main() -> None:
    """Translate an SRT file and print the translated SRT to stdout.

    Command line: ``INPUT.srt [FROM_LANG] [TO_LANG]``. The languages default
    to ``Dutch`` and ``English``.

    Each subtitle is translated in file order, with up to three previously
    translated subtitles and the next untranslated subtitle as context. A
    translated block is printed as soon as it is available.

    Exits with a usage message when no input path is given. Errors from
    reading, parsing or translating are not caught here.
    """
    if len(sys.argv) < 2:
        # Without this guard a missing argument surfaces as a bare IndexError.
        sys.exit(f"usage: {sys.argv[0]} INPUT.srt [FROM_LANG] [TO_LANG]")
    from_lang = sys.argv[2] if len(sys.argv) > 2 else "Dutch"
    to_lang = sys.argv[3] if len(sys.argv) > 3 else "English"

    # Read explicitly as UTF-8 instead of the locale-dependent default;
    # "utf-8-sig" also drops a leading byte-order mark.
    input_srt = Path(sys.argv[1]).read_text(encoding="utf-8-sig")
    subtitles = parse_subtitles(input_srt)

    translated_subtitles: list[tuple[int, str, str]] = []
    for i, (sequence_number, timestamp, subtitle_text) in enumerate(subtitles):
        # Context: the last (up to) 3 translated subtitles, formatted as SRT
        # blocks. Yields "" while nothing has been translated yet.
        previous_corpus = "\n\n".join(
            f"{prev_num}\n{prev_time}\n{prev_text}"
            for prev_num, prev_time, prev_text in translated_subtitles[-3:]
        )

        # Context: text of the next subtitle, if there is one.
        next_corpus = subtitles[i + 1][2] if i + 1 < len(subtitles) else None

        translation = translate(
            subtitle_text,
            from_lang,
            to_lang,
            previous_corpus,
            next_corpus,
        )

        # Keep the translation so later subtitles can use it as context.
        translated_subtitles.append((sequence_number, timestamp, translation))

        # Output the translated subtitle in SRT format
        print(sequence_number)
        print(timestamp)
        print(translation)
        print()
# Run the translator only when the file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment