Skip to content

Instantly share code, notes, and snippets.

@jaredyam
Last active June 2, 2022 09:07
Show Gist options
  • Save jaredyam/4fe7527ccf6981595a879c9705e56c51 to your computer and use it in GitHub Desktop.
Save jaredyam/4fe7527ccf6981595a879c9705e56c51 to your computer and use it in GitHub Desktop.
Translate a .srt file using Google Translate Ajax API
"""Translate .srt files using Google Translate Ajax API.
Usage
-----
$ python translate_srt.py *.srt [src=]en [dest=]zh-cn [-n *.srt] [-p 5] [-v]
Note:
- the values accepted by the positional arguments [src, dest] are listed in the *AVAILABLE LANGUAGES* section below.
- try assigning a higher value to the *patience* argument if you see a high failure ratio. [-p 5] is recommended. [-p -1] keeps retrying until no failure is left, which may take a long time to brute-force to completion.
Essential notes from the API documentation
-------------------------------
- The maximum character limit on a single text is 15k.
- Due to limitations of the web version of google translate, this API does not guarantee that the library would work properly at all times. (so please use this library if you don’t care about stability.)
- If you want to use a stable API, I highly recommend you to use Google’s official translate API.
- If you get HTTP 5xx error or errors like #6, it’s probably because Google has banned your client IP address.
AVAILABLE LANGUAGES
-------------------
LANGUAGES = {
'af': 'afrikaans',
'sq': 'albanian',
'am': 'amharic',
'ar': 'arabic',
'hy': 'armenian',
'az': 'azerbaijani',
'eu': 'basque',
'be': 'belarusian',
'bn': 'bengali',
'bs': 'bosnian',
'bg': 'bulgarian',
'ca': 'catalan',
'ceb': 'cebuano',
'ny': 'chichewa',
'zh-cn': 'chinese (simplified)',
'zh-tw': 'chinese (traditional)',
'co': 'corsican',
'hr': 'croatian',
'cs': 'czech',
'da': 'danish',
'nl': 'dutch',
'en': 'english',
'eo': 'esperanto',
'et': 'estonian',
'tl': 'filipino',
'fi': 'finnish',
'fr': 'french',
'fy': 'frisian',
'gl': 'galician',
'ka': 'georgian',
'de': 'german',
'el': 'greek',
'gu': 'gujarati',
'ht': 'haitian creole',
'ha': 'hausa',
'haw': 'hawaiian',
'iw': 'hebrew',
'hi': 'hindi',
'hmn': 'hmong',
'hu': 'hungarian',
'is': 'icelandic',
'ig': 'igbo',
'id': 'indonesian',
'ga': 'irish',
'it': 'italian',
'ja': 'japanese',
'jw': 'javanese',
'kn': 'kannada',
'kk': 'kazakh',
'km': 'khmer',
'ko': 'korean',
'ku': 'kurdish (kurmanji)',
'ky': 'kyrgyz',
'lo': 'lao',
'la': 'latin',
'lv': 'latvian',
'lt': 'lithuanian',
'lb': 'luxembourgish',
'mk': 'macedonian',
'mg': 'malagasy',
'ms': 'malay',
'ml': 'malayalam',
'mt': 'maltese',
'mi': 'maori',
'mr': 'marathi',
'mn': 'mongolian',
'my': 'myanmar (burmese)',
'ne': 'nepali',
'no': 'norwegian',
'ps': 'pashto',
'fa': 'persian',
'pl': 'polish',
'pt': 'portuguese',
'pa': 'punjabi',
'ro': 'romanian',
'ru': 'russian',
'sm': 'samoan',
'gd': 'scots gaelic',
'sr': 'serbian',
'st': 'sesotho',
'sn': 'shona',
'sd': 'sindhi',
'si': 'sinhala',
'sk': 'slovak',
'sl': 'slovenian',
'so': 'somali',
'es': 'spanish',
'su': 'sundanese',
'sw': 'swahili',
'sv': 'swedish',
'tg': 'tajik',
'ta': 'tamil',
'te': 'telugu',
'th': 'thai',
'tr': 'turkish',
'uk': 'ukrainian',
'ur': 'urdu',
'uz': 'uzbek',
'vi': 'vietnamese',
'cy': 'welsh',
'xh': 'xhosa',
'yi': 'yiddish',
'yo': 'yoruba',
'zu': 'zulu',
'fil': 'Filipino',
'he': 'Hebrew'
}
"""
import argparse
from googletrans import Translator
# Language codes accepted for the `src` and `dest` arguments, mapped to their
# human-readable names. The whole mapping is embedded in the command-line help
# text of those two arguments; it is not used to validate them.
# Filipino ('tl' / 'fil') and Hebrew ('iw' / 'he') each appear under two codes.
LANGUAGES = {
    'af': 'afrikaans',
    'sq': 'albanian',
    'am': 'amharic',
    'ar': 'arabic',
    'hy': 'armenian',
    'az': 'azerbaijani',
    'eu': 'basque',
    'be': 'belarusian',
    'bn': 'bengali',
    'bs': 'bosnian',
    'bg': 'bulgarian',
    'ca': 'catalan',
    'ceb': 'cebuano',
    'ny': 'chichewa',
    'zh-cn': 'chinese (simplified)',
    'zh-tw': 'chinese (traditional)',
    'co': 'corsican',
    'hr': 'croatian',
    'cs': 'czech',
    'da': 'danish',
    'nl': 'dutch',
    'en': 'english',
    'eo': 'esperanto',
    'et': 'estonian',
    'tl': 'filipino',
    'fi': 'finnish',
    'fr': 'french',
    'fy': 'frisian',
    'gl': 'galician',
    'ka': 'georgian',
    'de': 'german',
    'el': 'greek',
    'gu': 'gujarati',
    'ht': 'haitian creole',
    'ha': 'hausa',
    'haw': 'hawaiian',
    'iw': 'hebrew',
    'hi': 'hindi',
    'hmn': 'hmong',
    'hu': 'hungarian',
    'is': 'icelandic',
    'ig': 'igbo',
    'id': 'indonesian',
    'ga': 'irish',
    'it': 'italian',
    'ja': 'japanese',
    'jw': 'javanese',
    'kn': 'kannada',
    'kk': 'kazakh',
    'km': 'khmer',
    'ko': 'korean',
    'ku': 'kurdish (kurmanji)',
    'ky': 'kyrgyz',
    'lo': 'lao',
    'la': 'latin',
    'lv': 'latvian',
    'lt': 'lithuanian',
    'lb': 'luxembourgish',
    'mk': 'macedonian',
    'mg': 'malagasy',
    'ms': 'malay',
    'ml': 'malayalam',
    'mt': 'maltese',
    'mi': 'maori',
    'mr': 'marathi',
    'mn': 'mongolian',
    'my': 'myanmar (burmese)',
    'ne': 'nepali',
    'no': 'norwegian',
    'ps': 'pashto',
    'fa': 'persian',
    'pl': 'polish',
    'pt': 'portuguese',
    'pa': 'punjabi',
    'ro': 'romanian',
    'ru': 'russian',
    'sm': 'samoan',
    'gd': 'scots gaelic',
    'sr': 'serbian',
    'st': 'sesotho',
    'sn': 'shona',
    'sd': 'sindhi',
    'si': 'sinhala',
    'sk': 'slovak',
    'sl': 'slovenian',
    'so': 'somali',
    'es': 'spanish',
    'su': 'sundanese',
    'sw': 'swahili',
    'sv': 'swedish',
    'tg': 'tajik',
    'ta': 'tamil',
    'te': 'telugu',
    'th': 'thai',
    'tr': 'turkish',
    'uk': 'ukrainian',
    'ur': 'urdu',
    'uz': 'uzbek',
    'vi': 'vietnamese',
    'cy': 'welsh',
    'xh': 'xhosa',
    'yi': 'yiddish',
    'yo': 'yoruba',
    'zu': 'zulu',
    'fil': 'Filipino',
    'he': 'Hebrew',
}
def entries_generator(srt_file):
    """Lazily parse a .srt file into subtitle entries.

    An entry is a sequence-number line, a timecode line and the subtitle
    text lines that follow, terminated by a blank line or by the end of
    the file.

    args:
        srt_file: The original filename. [*.srt]

    yields:
        (number_in_sequence, timecode, subtitles): the first two are the raw
        lines exactly as read (line terminator included); subtitles is the
        list of raw text lines belonging to the entry.
    """
    with open(srt_file, 'r') as srt:
        while True:
            number_in_sequence = srt.readline()
            # Tolerate extra blank lines between entries; without this a
            # stray blank line would be taken for the sequence number and
            # shift every following field by one line.
            while number_in_sequence == '\n':
                number_in_sequence = srt.readline()
            # readline() returns '' only at the end of the file.
            if not number_in_sequence:
                break
            timecode = srt.readline()

            subtitles = []
            while True:
                subtitle = srt.readline()
                # A blank line closes the entry. '' means the file ended
                # without a trailing blank line; it must also close the
                # entry, otherwise this loop would never terminate.
                if subtitle in ('', '\n'):
                    break
                subtitles.append(subtitle)

            yield number_in_sequence, timecode, subtitles
def translate(entries, src, dest, patience, verbose, translator=None):
    """Generate the translated entries.

    Every subtitle line is sent to the translator on its own. A result that
    still ends with a newline is treated as a failed translation (presumably
    because a successful translation comes back without the trailing
    newline -- confirm against the translator in use) and is retried.

    args:
        entries: The entries queue, an iterable of
            (number_in_sequence, timecode, subtitles) tuples.
        src: The source language.
        dest: The target language.
        patience: Maximum number of retries for each failing subtitle line.
            None or 0 disables retrying; a negative value retries until the
            line is translated.
        verbose: Whether to log every retry and every translated entry
            instead of the one-line status per entry.
        translator: Optional object exposing
            ``translate(text, src=..., dest=...)`` whose result has a
            ``text`` attribute. A new ``Translator()`` is created when
            omitted.

    yields:
        (number_in_sequence, timecode, translated_subtitles, count_failure,
        count_entries), where the two counters are running totals of the
        entries that contain at least one untranslated line and of the
        entries processed so far.
    """
    if translator is None:
        translator = Translator()
    count_failure = 0
    count_entries = 0

    for number_in_sequence, timecode, subtitles in entries:
        count_entries += 1
        # Tracked per entry so that an entry without subtitle lines, or a
        # failure on a line other than the last one, is reported correctly.
        entry_failed = False
        translated_subtitles = []

        for i, subtitle in enumerate(subtitles, 1):
            # handle the special case: empty string.
            if not subtitle:
                translated_subtitles.append(subtitle)
                continue

            translated_subtitle = translator.translate(
                subtitle, src=src, dest=dest).text
            # endswith() also copes with an empty translation result.
            fail_to_translate = translated_subtitle.endswith('\n')

            # Each line gets its own retry budget; a shared counter would be
            # used up by the first failing lines and leave none for the rest.
            retries_left = patience or 0
            while fail_to_translate and retries_left:
                if verbose:
                    print('[Failure] Retry to translate...')
                    print(f'The translated subtitle: {translated_subtitle}', end='')
                translated_subtitle = translator.translate(
                    subtitle, src=src, dest=dest).text
                fail_to_translate = translated_subtitle.endswith('\n')
                if not fail_to_translate:
                    if verbose:
                        print(f'Translate successfully. The result: {translated_subtitle}')
                    break
                # A negative budget means "retry forever" and is never
                # decremented.
                if retries_left > 0:
                    retries_left -= 1
                    if retries_left == 0 and verbose:
                        print(f'This subtitle failed to translate... [Position] entry {count_entries} line {i}')

            entry_failed = entry_failed or fail_to_translate
            # An untranslated line still carries its own newline.
            translated_subtitles.append(
                translated_subtitle if fail_to_translate else translated_subtitle + '\n')

        # Count failures regardless of the logging mode.
        if entry_failed:
            count_failure += 1

        if verbose:
            print('######################################################')
            print(f'Current number in sequence: {count_entries}')
            print('The tranlation result:')
            print(f"{''.join(translated_subtitles)}")
            print('######################################################')
        else:
            if entry_failed:
                print(f'[{count_entries}] Failure to translate current entry...')
            else:
                print(f'[{count_entries}] Current entry has been translated...')
            print(f'Total failures: {count_failure}/{count_entries}')

        yield number_in_sequence, timecode, translated_subtitles, count_failure, count_entries
def main():
    """Parse the command line, translate the .srt file and write the result.

    The output goes to the file given with -n/--rename, or to
    ``<input name>_translated.srt`` when no name is given.
    """
    # The example invocation is the usage text, not the program name, so it
    # is passed as `usage` rather than as the first positional (`prog`).
    parser = argparse.ArgumentParser(
        usage='\n$ python translate_srt.py *.srt [src=]en [dest=]zh-cn [-n *.srt] [-p 5] [-v]\n')
    parser.add_argument('srt_file', type=str, help='requires a .srt file.')
    parser.add_argument('src', type=str,
                        help=f'the source language of your .srt file. available languages: {LANGUAGES}')
    parser.add_argument('dest', type=str,
                        help=f'the target language you would like to get. available languages: {LANGUAGES}')
    parser.add_argument('-n', '--rename', type=str,
                        help='rename the output file.')
    parser.add_argument('-p', '--patience', type=int,
                        help='the patience of retrying to translate. Expect a positive number. If -1 is assigned, the program will try for infinite times until there is no failures happened in the output.')
    parser.add_argument('-v', '--verbose', action="store_true",
                        help='logs the translation process to console.')
    args = parser.parse_args()

    srt_file = args.srt_file
    entries = entries_generator(srt_file)

    if args.rename:
        translated_file = args.rename
    else:
        # Only strip the extension when it really is ".srt"; blindly cutting
        # four characters would mangle any other file name.
        stem = srt_file[:-4] if srt_file.lower().endswith('.srt') else srt_file
        translated_file = stem + '_translated.srt'

    # Defined up front so the summary below also works for an input file
    # that contains no entries at all.
    count_failure = 0
    count_entries = 0
    with open(translated_file, 'w') as f:
        for number_in_sequence, timecode, subtitles, count_failure, count_entries in translate(
                entries, src=args.src, dest=args.dest,
                patience=args.patience, verbose=args.verbose):
            f.write(number_in_sequence)
            f.write(timecode)
            for subtitle in subtitles:
                f.write(subtitle)
            # Blank line separating this entry from the next one.
            f.write('\n')

    print(f'Done. Please check: {translated_file}')
    print(f'Total failure to translate entries: {count_failure}/{count_entries}')
    if count_failure:
        print(
            'If you expect a lower failure ratio or completed translate, please check out the usage of [-p | --patience] argument.')


if __name__ == '__main__':
    main()
@botbahlul
Copy link

botbahlul commented Jun 2, 2022

@botbahlul

this script has never complete the execution

It should work when there are two newlines at the end of your .srt file. Of these two newlines, one is read by the readline() method (line 263), and the other satisfies the exit condition of a full timestamp entry (line 265).

The current implementation detects the end of a timestamp entry by a newline separator, which is too strict in some cases. A hot-fix for your case is to update line 265: the script should also exit this while loop when it meets an empty string, i.e., the end of the file.

I may rewrite this script in the following days, the updates may include some fixes and notes.

can't wait the update

I really love this script

BTW, is PATIENCE similar to CONCURRENCY? I can't find its routine on this script

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment