Last active
June 2, 2022 09:07
-
-
Save jaredyam/4fe7527ccf6981595a879c9705e56c51 to your computer and use it in GitHub Desktop.
Translate a .srt file using Google Translate Ajax API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Translate .srt files using Google Translate Ajax API. | |
Usage | |
----- | |
$ python translate_srt.py *.srt [src=]en [dest=]zh-cn [-n *.srt] [-p 5] [-v] | |
Note: | |
- the available values for the positional arguments [src, dest] can be obtained from the *AVAILABLE LANGUAGES* section below. | |
- try assigning a higher value to the *patience* argument if you see a high failure ratio. [-p 5] is recommended. [-p -1] keeps retrying until no failures remain, which may take a long time to brute-force to completion. | |
Essential notes from the API documentation | |
------------------------------- | |
- The maximum character limit on a single text is 15k. | |
- Due to limitations of the web version of google translate, this API does not guarantee that the library would work properly at all times. (so please use this library if you don’t care about stability.) | |
- If you want to use a stable API, I highly recommend you to use Google’s official translate API. | |
- If you get HTTP 5xx error or errors like #6, it’s probably because Google has banned your client IP address. | |
AVAILABLE LANGUAGES | |
------------------- | |
LANGUAGES = { | |
'af': 'afrikaans', | |
'sq': 'albanian', | |
'am': 'amharic', | |
'ar': 'arabic', | |
'hy': 'armenian', | |
'az': 'azerbaijani', | |
'eu': 'basque', | |
'be': 'belarusian', | |
'bn': 'bengali', | |
'bs': 'bosnian', | |
'bg': 'bulgarian', | |
'ca': 'catalan', | |
'ceb': 'cebuano', | |
'ny': 'chichewa', | |
'zh-cn': 'chinese (simplified)', | |
'zh-tw': 'chinese (traditional)', | |
'co': 'corsican', | |
'hr': 'croatian', | |
'cs': 'czech', | |
'da': 'danish', | |
'nl': 'dutch', | |
'en': 'english', | |
'eo': 'esperanto', | |
'et': 'estonian', | |
'tl': 'filipino', | |
'fi': 'finnish', | |
'fr': 'french', | |
'fy': 'frisian', | |
'gl': 'galician', | |
'ka': 'georgian', | |
'de': 'german', | |
'el': 'greek', | |
'gu': 'gujarati', | |
'ht': 'haitian creole', | |
'ha': 'hausa', | |
'haw': 'hawaiian', | |
'iw': 'hebrew', | |
'hi': 'hindi', | |
'hmn': 'hmong', | |
'hu': 'hungarian', | |
'is': 'icelandic', | |
'ig': 'igbo', | |
'id': 'indonesian', | |
'ga': 'irish', | |
'it': 'italian', | |
'ja': 'japanese', | |
'jw': 'javanese', | |
'kn': 'kannada', | |
'kk': 'kazakh', | |
'km': 'khmer', | |
'ko': 'korean', | |
'ku': 'kurdish (kurmanji)', | |
'ky': 'kyrgyz', | |
'lo': 'lao', | |
'la': 'latin', | |
'lv': 'latvian', | |
'lt': 'lithuanian', | |
'lb': 'luxembourgish', | |
'mk': 'macedonian', | |
'mg': 'malagasy', | |
'ms': 'malay', | |
'ml': 'malayalam', | |
'mt': 'maltese', | |
'mi': 'maori', | |
'mr': 'marathi', | |
'mn': 'mongolian', | |
'my': 'myanmar (burmese)', | |
'ne': 'nepali', | |
'no': 'norwegian', | |
'ps': 'pashto', | |
'fa': 'persian', | |
'pl': 'polish', | |
'pt': 'portuguese', | |
'pa': 'punjabi', | |
'ro': 'romanian', | |
'ru': 'russian', | |
'sm': 'samoan', | |
'gd': 'scots gaelic', | |
'sr': 'serbian', | |
'st': 'sesotho', | |
'sn': 'shona', | |
'sd': 'sindhi', | |
'si': 'sinhala', | |
'sk': 'slovak', | |
'sl': 'slovenian', | |
'so': 'somali', | |
'es': 'spanish', | |
'su': 'sundanese', | |
'sw': 'swahili', | |
'sv': 'swedish', | |
'tg': 'tajik', | |
'ta': 'tamil', | |
'te': 'telugu', | |
'th': 'thai', | |
'tr': 'turkish', | |
'uk': 'ukrainian', | |
'ur': 'urdu', | |
'uz': 'uzbek', | |
'vi': 'vietnamese', | |
'cy': 'welsh', | |
'xh': 'xhosa', | |
'yi': 'yiddish', | |
'yo': 'yoruba', | |
'zu': 'zulu', | |
'fil': 'Filipino', | |
'he': 'Hebrew' | |
} | |
""" | |
import argparse | |
from googletrans import Translator | |
# Language codes accepted for the `src` / `dest` command-line arguments,
# mapped to their human-readable names. The whole mapping is interpolated
# into the argparse help text of the script.
LANGUAGES = {
    "af": "afrikaans",
    "sq": "albanian",
    "am": "amharic",
    "ar": "arabic",
    "hy": "armenian",
    "az": "azerbaijani",
    "eu": "basque",
    "be": "belarusian",
    "bn": "bengali",
    "bs": "bosnian",
    "bg": "bulgarian",
    "ca": "catalan",
    "ceb": "cebuano",
    "ny": "chichewa",
    "zh-cn": "chinese (simplified)",
    "zh-tw": "chinese (traditional)",
    "co": "corsican",
    "hr": "croatian",
    "cs": "czech",
    "da": "danish",
    "nl": "dutch",
    "en": "english",
    "eo": "esperanto",
    "et": "estonian",
    "tl": "filipino",
    "fi": "finnish",
    "fr": "french",
    "fy": "frisian",
    "gl": "galician",
    "ka": "georgian",
    "de": "german",
    "el": "greek",
    "gu": "gujarati",
    "ht": "haitian creole",
    "ha": "hausa",
    "haw": "hawaiian",
    "iw": "hebrew",
    "hi": "hindi",
    "hmn": "hmong",
    "hu": "hungarian",
    "is": "icelandic",
    "ig": "igbo",
    "id": "indonesian",
    "ga": "irish",
    "it": "italian",
    "ja": "japanese",
    "jw": "javanese",
    "kn": "kannada",
    "kk": "kazakh",
    "km": "khmer",
    "ko": "korean",
    "ku": "kurdish (kurmanji)",
    "ky": "kyrgyz",
    "lo": "lao",
    "la": "latin",
    "lv": "latvian",
    "lt": "lithuanian",
    "lb": "luxembourgish",
    "mk": "macedonian",
    "mg": "malagasy",
    "ms": "malay",
    "ml": "malayalam",
    "mt": "maltese",
    "mi": "maori",
    "mr": "marathi",
    "mn": "mongolian",
    "my": "myanmar (burmese)",
    "ne": "nepali",
    "no": "norwegian",
    "ps": "pashto",
    "fa": "persian",
    "pl": "polish",
    "pt": "portuguese",
    "pa": "punjabi",
    "ro": "romanian",
    "ru": "russian",
    "sm": "samoan",
    "gd": "scots gaelic",
    "sr": "serbian",
    "st": "sesotho",
    "sn": "shona",
    "sd": "sindhi",
    "si": "sinhala",
    "sk": "slovak",
    "sl": "slovenian",
    "so": "somali",
    "es": "spanish",
    "su": "sundanese",
    "sw": "swahili",
    "sv": "swedish",
    "tg": "tajik",
    "ta": "tamil",
    "te": "telugu",
    "th": "thai",
    "tr": "turkish",
    "uk": "ukrainian",
    "ur": "urdu",
    "uz": "uzbek",
    "vi": "vietnamese",
    "cy": "welsh",
    "xh": "xhosa",
    "yi": "yiddish",
    "yo": "yoruba",
    "zu": "zulu",
    "fil": "Filipino",
    "he": "Hebrew",
}
def entries_generator(srt_file):
    """Lazily yield the entries of a .srt file, one at a time.

    An entry is read as: one line holding the sequence number, one line
    holding the timecode, then every following line up to (not including)
    the next blank line or the end of the file.

    args:
        srt_file: Path of the original subtitle file. [*.srt]

    yields:
        (number_in_sequence, timecode, subtitles) where the first two are
        the raw lines (line terminator included) and ``subtitles`` is the
        list of raw text lines belonging to the entry.
    """
    with open(srt_file, 'r') as srt:
        while True:
            number_in_sequence = srt.readline()
            # Tolerate stray blank lines between entries (and trailing ones
            # at the end of the file) instead of mistaking them for a
            # sequence number.
            while number_in_sequence == '\n':
                number_in_sequence = srt.readline()
            # readline() returns '' only at the end of the file.
            if not number_in_sequence:
                break
            timecode = srt.readline()

            # Collect the text lines of this entry. The two-argument form of
            # iter() stops at EOF (''), so a file whose last entry is not
            # followed by a blank line terminates instead of looping forever.
            subtitles = []
            for subtitle in iter(srt.readline, ''):
                # A blank line marks the end of the entry.
                if subtitle == '\n':
                    break
                subtitles.append(subtitle)
            yield number_in_sequence, timecode, subtitles
def _looks_untranslated(text):
    """Return True when *text* still ends with a newline.

    This is the script's failure heuristic: a result that keeps the trailing
    newline of the source line is treated as "not translated".
    NOTE(review): presumably a successful response drops the newline --
    confirm against the googletrans behaviour you observe.
    """
    return text.endswith('\n')


def _translate_line(translator, subtitle, src, dest, patience, verbose,
                    entry_no, line_no):
    """Translate one subtitle line, retrying while it looks untranslated.

    ``patience`` is the retry budget for THIS line: ``None``/``0`` means no
    retry, a positive number is the maximum number of retries, a negative
    number (``-1``) retries until the translation succeeds.

    Returns ``(text, failed)``.
    """
    translated = translator.translate(subtitle, src=src, dest=dest).text
    retries_left = patience or 0
    while retries_left and _looks_untranslated(translated):
        if verbose:
            print('[Failure] Retry to translate...')
            print(f'The translated subtitle: {translated}', end='')
        # Always retry from the source line, never from a previous result.
        translated = translator.translate(subtitle, src=src, dest=dest).text
        if not _looks_untranslated(translated):
            if verbose:
                print(f'Translate successfully. The result: {translated}')
            break
        if retries_left > 0:
            retries_left -= 1
            if retries_left == 0 and verbose:
                print(f'This subtitle failed to translate... [Position] entry {entry_no} line {line_no}')
    return translated, _looks_untranslated(translated)


def translate(entries, src, dest, patience, verbose):
    """Generate the translated entries.

    args:
        entries: Iterable of (number_in_sequence, timecode, subtitles)
            tuples, as produced by ``entries_generator``.
        src: The source language code.
        dest: The target language code.
        patience: Retry budget applied to every subtitle line independently
            (``None``/``0``: never retry, ``-1``: retry until success).
        verbose: When true, log every retry and every translated entry;
            otherwise print a one-line status per entry.

    yields:
        (number_in_sequence, timecode, translated_subtitles, count_failure,
        count_entries). The two counters are running totals; an entry counts
        as failed when at least one of its lines could not be translated.
    """
    translator = Translator()
    count_failure = 0
    count_entries = 0
    for number_in_sequence, timecode, subtitles in entries:
        count_entries += 1
        translated_subtitles = []
        # Reset per entry so an entry without text lines is never judged by
        # the outcome of a previous one.
        entry_failed = False
        for i, subtitle in enumerate(subtitles, 1):
            # handle the special case: empty string.
            if not subtitle:
                translated_subtitles.append(subtitle)
                continue
            translated_subtitle, failed = _translate_line(
                translator, subtitle, src, dest, patience, verbose,
                count_entries, i)
            entry_failed = entry_failed or failed
            # A failed line still carries its newline; a translated one
            # needs it added back.
            translated_subtitles.append(
                translated_subtitle if failed else translated_subtitle + '\n')

        # Count failures regardless of the logging mode.
        if entry_failed:
            count_failure += 1

        if verbose:
            print('######################################################')
            print(f'Current number in sequence: {count_entries}')
            print(f'The tranlation result:')
            print(f"{''.join(translated_subtitles)}")
            print('######################################################')
        else:
            if entry_failed:
                print(f'[{count_entries}] Failure to translate current entry...')
            else:
                print(f'[{count_entries}] Current entry has been translated...')
            print(f'Total failures: {count_failure}/{count_entries}')
        yield number_in_sequence, timecode, translated_subtitles, count_failure, count_entries
if __name__ == '__main__':
    # Command-line entry point: parse the arguments, translate the given
    # .srt file entry by entry and write the result next to it (or to the
    # name given with -n).
    import os

    parser = argparse.ArgumentParser(
        '\n$ python translate_srt.py *.srt [src=]en [dest=]zh-cn [-n *.srt] [-p 5] [-v]\n')
    parser.add_argument('srt_file', type=str, help='requires a .srt file.')
    parser.add_argument('src', type=str,
                        help=f'the source language of your .srt file. available languages: {LANGUAGES}')
    parser.add_argument('dest', type=str,
                        help=f'the target language you would like to get. available languages: {LANGUAGES}')
    parser.add_argument('-n', '--rename', type=str,
                        help='rename the output file.')
    parser.add_argument('-p', '--patience', type=int,
                        help='the patience of retrying to translate. Expect a positive number. If -1 is assigned, the program will try for infinite times until there is no failures happened in the output.')
    parser.add_argument('-v', '--verbose', action="store_true",
                        help='logs the translation process to console.')
    args = parser.parse_args()

    srt_file = args.srt_file
    entries = entries_generator(srt_file)
    # Strip the real extension rather than blindly cutting four characters.
    translated_file = args.rename if args.rename else os.path.splitext(srt_file)[0] + '_translated.srt'

    # Pre-set the totals so the summary below also works when the input
    # file yields no entries at all.
    count_failure = 0
    count_entries = 0
    # Translated text may contain any script, so write UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    with open(translated_file, 'w', encoding='utf-8') as f:
        for number_in_sequence, timecode, subtitles, count_failure, count_entries in translate(
                entries, src=args.src, dest=args.dest,
                patience=args.patience, verbose=args.verbose):
            f.write(number_in_sequence)
            f.write(timecode)
            f.writelines(subtitles)
            # Blank line separating this entry from the next one.
            f.write('\n')

    print(f'Done. Please check: {translated_file}')
    print(f'Total failure to translate entries: {count_failure}/{count_entries}')
    if count_failure > 0:
        print(
            'If you expect a lower failure ratio or completed translate, please check out the usage of [-p | --patience] argument.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
can't wait the update
I really love this script
BTW, is PATIENCE similar to CONCURRENCY? I can't find its routine on this script