Created
November 19, 2017 02:11
-
-
Save augustomen/7309b747ab4e9c03d377ee9a97a25240 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
import sys | |
# Line terminator used both to split the input subtitle file and to write the
# output (CRLF).
NEWLINE = '\r\n'

# Matches a cue timing line such as "00:00:01,000 --> 00:00:02,500"; the
# fractional separator may be "," or "." and 0-3 fraction digits are accepted.
# NOTE(review): not referenced by the code visible in this file.
TIME_STR = re.compile(r'\d{2}\:\d{2}\:\d{2}[\,\.]\d{0,3}\s*--\>\s*\d{2}\:\d{2}\:\d{2}[\,\.]\d{0,3}')

# Matches any "<...>" markup tag (e.g. <i>, </font>) so it can be removed.
TAGS_PATTERN = re.compile(r'\<[^\>]*\>')
def strip_tags(text):
    """Return *text* with every ``<...>`` tag removed and outer whitespace trimmed.

    A falsy *text* (``None`` or an empty string) yields an empty string.
    """
    source = text if text else u''
    without_tags = TAGS_PATTERN.sub(u'', source)
    return without_tags.strip()
def _as_native_str(value):
    """Return *value* as the interpreter's native ``str`` type (UTF-8 codec).

    On Python 2 ``str`` is a byte string, so unicode text is encoded; on
    Python 3 ``str`` is text, so bytes are decoded. A value that already is a
    ``str`` is returned unchanged.
    """
    if isinstance(value, str):
        return value
    if isinstance(value, bytes):
        # Only reachable on Python 3 (on Python 2, bytes is str).
        return value.decode('utf-8')
    # Python 2 unicode object.
    return value.encode('utf-8')


def translate_srt(from_lang, to_lang, input_file, output, batch_size=50):
    """Translate the text of an SRT subtitle file and write the result.

    The file is split into cues on blank lines (CRLF line endings). For each
    cue the first two lines (index and timing) are copied through unchanged;
    the remaining lines have markup tags stripped and are translated in
    batches through ``mtranslate.translate``.

    Args:
        from_lang: Source language code passed to the translator.
        to_lang: Target language code passed to the translator.
        input_file: Path of the SRT file to read; it is decoded as UTF-8
            when running on Python 3.
        output: Writable file-like object that receives the translated SRT
            (``write`` and ``flush`` are called on it).
        batch_size: Number of cues sent to the translator per request.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    # Imported lazily, as in the original, so the module can be loaded
    # without the third-party dependency installed.
    import mtranslate

    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    # Close the file deterministically instead of leaking the handle.
    with open(input_file, 'rb') as source:
        content = _as_native_str(source.read())

    # U+266A (eighth note) in the same string type as the file content.
    music_note = _as_native_str(u'\u266a')

    cues = []
    for block in content.split(NEWLINE + NEWLINE):
        # Drop stray CR/LF characters at either end so the header is always
        # lines[:2], and skip blank blocks (e.g. after the final separator).
        block = block.strip(NEWLINE)
        if not block:
            continue
        lines = block.replace(music_note, '\'').split(NEWLINE)
        header = NEWLINE.join(lines[:2])
        # Lines of one cue are joined with '|' so they can be split again
        # after translation.
        text = strip_tags('|'.join(lines[2:]))
        cues.append((header, text))

    for start in range(0, len(cues), batch_size):
        batch = cues[start:start + batch_size]
        # Cues of one batch are joined with ' ||| ' and sent as one request.
        translated = mtranslate.translate(
            ' ||| '.join(text for _, text in batch),
            to_language=to_lang, from_language=from_lang)
        pieces = _as_native_str(translated).split('|||')
        # If the translator dropped separators, pad so every cue still gets
        # a (placeholder) text.
        pieces += ['...'] * (len(batch) - len(pieces))
        for (header, _), piece in zip(batch, pieces):
            output.write(header)
            output.write(NEWLINE)
            output.write(NEWLINE.join(s.strip() for s in piece.split('|')))
            output.write(NEWLINE)
            output.write(NEWLINE)
        output.flush()
if __name__ == '__main__':
    # Exactly three positional arguments are required after the script name.
    if len(sys.argv) != 4:
        sys.stderr.write(
            'Usage:\n'
            'python translate_srt.py <from_language> <to_language> input_file\n')
        sys.exit(1)
    from_language, to_language, srt_path = sys.argv[1:]
    # The translated subtitles are written to standard output.
    translate_srt(from_language, to_language, srt_path, sys.stdout)
    sys.exit(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment