jaredyam · June 2, 2022 09:07 · botbahlul · Jun 2, 2022
diff --git a/translate-srt-subtitles.py b/translate-srt-subtitles.py
 """Translate .srt files using Google Translate Ajax API.

 Usage
 -----
 $ python translate_srt.py *.srt [src=]en [dest=]zh-cn [-n *.srt] [-p 5] [-v]

 Note:
 - the values accepted by the positional arguments [src, dest] can be obtained from the *AVAILABLE LANGUAGES* section below.
 - try assigning a higher value to the *patience* argument if you meet a high failure ratio. [-p 5] is recommended. [-p -1] keeps retrying until no failure is left, which may take a long time to brute-force to completion.

 Essential notes from the API documentation
 -------------------------------
 - The maximum character limit on a single text is 15k.
 - Due to limitations of the web version of google translate, this API does not guarantee that the library would work properly at all times. (so please use this library if you don’t care about stability.)
 - If you want to use a stable API, I highly recommend you to use Google’s official translate API.
 - If you get HTTP 5xx error or errors like #6, it’s probably because Google has banned your client IP address.

 AVAILABLE LANGUAGES
 -------------------
 LANGUAGES = {
    'af': 'afrikaans',
    'sq': 'albanian',
    'am': 'amharic',
    'ar': 'arabic',
    'hy': 'armenian',
    'az': 'azerbaijani',
    'eu': 'basque',
    'be': 'belarusian',
    'bn': 'bengali',
    'bs': 'bosnian',
    'bg': 'bulgarian',
    'ca': 'catalan',
    'ceb': 'cebuano',
    'ny': 'chichewa',
    'zh-cn': 'chinese (simplified)',
    'zh-tw': 'chinese (traditional)',
    'co': 'corsican',
    'hr': 'croatian',
    'cs': 'czech',
    'da': 'danish',
    'nl': 'dutch',
    'en': 'english',
    'eo': 'esperanto',
    'et': 'estonian',
    'tl': 'filipino',
    'fi': 'finnish',
    'fr': 'french',
    'fy': 'frisian',
    'gl': 'galician',
    'ka': 'georgian',
    'de': 'german',
    'el': 'greek',
    'gu': 'gujarati',
    'ht': 'haitian creole',
    'ha': 'hausa',
    'haw': 'hawaiian',
    'iw': 'hebrew',
    'hi': 'hindi',
    'hmn': 'hmong',
    'hu': 'hungarian',
    'is': 'icelandic',
    'ig': 'igbo',
    'id': 'indonesian',
    'ga': 'irish',
    'it': 'italian',
    'ja': 'japanese',
    'jw': 'javanese',
    'kn': 'kannada',
    'kk': 'kazakh',
    'km': 'khmer',
    'ko': 'korean',
    'ku': 'kurdish (kurmanji)',
    'ky': 'kyrgyz',
    'lo': 'lao',
    'la': 'latin',
    'lv': 'latvian',
    'lt': 'lithuanian',
    'lb': 'luxembourgish',
    'mk': 'macedonian',
    'mg': 'malagasy',
    'ms': 'malay',
    'ml': 'malayalam',
    'mt': 'maltese',
    'mi': 'maori',
    'mr': 'marathi',
    'mn': 'mongolian',
    'my': 'myanmar (burmese)',
    'ne': 'nepali',
    'no': 'norwegian',
    'ps': 'pashto',
    'fa': 'persian',
    'pl': 'polish',
    'pt': 'portuguese',
    'pa': 'punjabi',
    'ro': 'romanian',
    'ru': 'russian',
    'sm': 'samoan',
    'gd': 'scots gaelic',
    'sr': 'serbian',
    'st': 'sesotho',
    'sn': 'shona',
    'sd': 'sindhi',
    'si': 'sinhala',
    'sk': 'slovak',
    'sl': 'slovenian',
    'so': 'somali',
    'es': 'spanish',
    'su': 'sundanese',
    'sw': 'swahili',
    'sv': 'swedish',
    'tg': 'tajik',
    'ta': 'tamil',
    'te': 'telugu',
    'th': 'thai',
    'tr': 'turkish',
    'uk': 'ukrainian',
    'ur': 'urdu',
    'uz': 'uzbek',
    'vi': 'vietnamese',
    'cy': 'welsh',
    'xh': 'xhosa',
    'yi': 'yiddish',
    'yo': 'yoruba',
    'zu': 'zulu',
    'fil': 'Filipino',
    'he': 'Hebrew'
 }
 """
 import argparse
 from googletrans import Translator

 # Language codes offered for the ``src``/``dest`` arguments, mapped to the
 # English name of each language. In the code shown here the table is only
 # interpolated into the argparse help text; the arguments are not validated
 # against it.
 # Two languages are listed under two codes each: Filipino ('tl', 'fil') and
 # Hebrew ('iw', 'he').
 # NOTE(review): presumably mirrors the language list shipped with
 # googletrans -- confirm before relying on it.
 LANGUAGES = {
    'af': 'afrikaans',
    'sq': 'albanian',
    'am': 'amharic',
    'ar': 'arabic',
    'hy': 'armenian',
    'az': 'azerbaijani',
    'eu': 'basque',
    'be': 'belarusian',
    'bn': 'bengali',
    'bs': 'bosnian',
    'bg': 'bulgarian',
    'ca': 'catalan',
    'ceb': 'cebuano',
    'ny': 'chichewa',
    'zh-cn': 'chinese (simplified)',
    'zh-tw': 'chinese (traditional)',
    'co': 'corsican',
    'hr': 'croatian',
    'cs': 'czech',
    'da': 'danish',
    'nl': 'dutch',
    'en': 'english',
    'eo': 'esperanto',
    'et': 'estonian',
    'tl': 'filipino',
    'fi': 'finnish',
    'fr': 'french',
    'fy': 'frisian',
    'gl': 'galician',
    'ka': 'georgian',
    'de': 'german',
    'el': 'greek',
    'gu': 'gujarati',
    'ht': 'haitian creole',
    'ha': 'hausa',
    'haw': 'hawaiian',
    'iw': 'hebrew',
    'hi': 'hindi',
    'hmn': 'hmong',
    'hu': 'hungarian',
    'is': 'icelandic',
    'ig': 'igbo',
    'id': 'indonesian',
    'ga': 'irish',
    'it': 'italian',
    'ja': 'japanese',
    'jw': 'javanese',
    'kn': 'kannada',
    'kk': 'kazakh',
    'km': 'khmer',
    'ko': 'korean',
    'ku': 'kurdish (kurmanji)',
    'ky': 'kyrgyz',
    'lo': 'lao',
    'la': 'latin',
    'lv': 'latvian',
    'lt': 'lithuanian',
    'lb': 'luxembourgish',
    'mk': 'macedonian',
    'mg': 'malagasy',
    'ms': 'malay',
    'ml': 'malayalam',
    'mt': 'maltese',
    'mi': 'maori',
    'mr': 'marathi',
    'mn': 'mongolian',
    'my': 'myanmar (burmese)',
    'ne': 'nepali',
    'no': 'norwegian',
    'ps': 'pashto',
    'fa': 'persian',
    'pl': 'polish',
    'pt': 'portuguese',
    'pa': 'punjabi',
    'ro': 'romanian',
    'ru': 'russian',
    'sm': 'samoan',
    'gd': 'scots gaelic',
    'sr': 'serbian',
    'st': 'sesotho',
    'sn': 'shona',
    'sd': 'sindhi',
    'si': 'sinhala',
    'sk': 'slovak',
    'sl': 'slovenian',
    'so': 'somali',
    'es': 'spanish',
    'su': 'sundanese',
    'sw': 'swahili',
    'sv': 'swedish',
    'tg': 'tajik',
    'ta': 'tamil',
    'te': 'telugu',
    'th': 'thai',
    'tr': 'turkish',
    'uk': 'ukrainian',
    'ur': 'urdu',
    'uz': 'uzbek',
    'vi': 'vietnamese',
    'cy': 'welsh',
    'xh': 'xhosa',
    'yi': 'yiddish',
    'yo': 'yoruba',
    'zu': 'zulu',
    'fil': 'Filipino',
    'he': 'Hebrew'
 }


 def entries_generator(srt_file):
     """Lazily parse a .srt file into subtitle entries.

     Each entry is expected to consist of an index line, a timecode line,
     any number of text lines and a terminating blank line. The blank line
     after the last entry may be missing.

     args:
         srt_file: Path of the .srt file to read.

     yields:
         (number_in_sequence, timecode, subtitles): the first two are the raw
         lines as read from the file; ``subtitles`` is the list of text
         lines of the entry, each one ending with a newline.
     """
     with open(srt_file, 'r') as srt:
         while True:
             number_in_sequence = srt.readline()
             # Tolerate surplus blank lines between entries instead of
             # mistaking one of them for an index line.
             while number_in_sequence == '\n':
                 number_in_sequence = srt.readline()
             # readline() returns '' only at the end of the file.
             if not number_in_sequence:
                 break
             timecode = srt.readline()
             subtitles = []
             while True:
                 subtitle = srt.readline()
                 # A blank line closes the entry. '' means the file ended
                 # without one; not stopping here would loop forever.
                 if subtitle in ('', '\n'):
                     break
                 # Only the very last line of a file can lack its newline;
                 # normalise it so every yielded text line looks the same.
                 if not subtitle.endswith('\n'):
                     subtitle += '\n'
                 subtitles.append(subtitle)
             yield number_in_sequence, timecode, subtitles


 def translate(entries, src, dest, patience, verbose, translator=None):
     """Translate subtitle entries one by one, yielding each result.

     A line is treated as failed when the text returned by the translator
     still ends with a newline. NOTE(review): this heuristic is inherited
     from the original code and presumably detects the source line being
     echoed back untranslated -- confirm against googletrans.

     args:
         entries: Iterable of (number_in_sequence, timecode, subtitles).
         src: The source language code.
         dest: The target language code.
         patience: Number of failed retries allowed for each subtitle line.
             0 or None disables retrying; a negative value (e.g. -1) keeps
             retrying until the line succeeds.
         verbose: When true, print a detailed log of the translation.
         translator: Optional object with a
             ``translate(text, src=..., dest=...)`` method whose result has
             a ``text`` attribute. A new ``Translator`` is created when
             omitted.

     yields:
         (number_in_sequence, timecode, translated_subtitles, count_failure,
         count_entries). The two counters are running totals: entries with
         at least one failed line, and entries processed so far.
     """
     if translator is None:
         translator = Translator()
     count_failure = 0
     count_entries = 0
     for number_in_sequence, timecode, subtitles in entries:
         count_entries += 1
         translated_subtitles = []
         # Reset for every entry so that an entry without text lines is
         # handled and no result leaks over from the previous entry.
         entry_failed = False
         for i, subtitle in enumerate(subtitles, 1):
             # Nothing to translate: pass blank lines through untouched.
             if not subtitle.strip():
                 translated_subtitles.append(subtitle)
                 continue
             translated_subtitle = translator.translate(
                 subtitle, src=src, dest=dest).text
             # endswith() is safe even when the translation is empty.
             fail_to_translate = translated_subtitle.endswith('\n')
             # Every line gets the full retry budget; sharing one counter
             # would leave later lines without any retry.
             retries_left = patience
             while fail_to_translate and retries_left:
                 if verbose:
                     print('[Failure] Retry to translate...')
                     print(f'The translated subtitle: {translated_subtitle}', end='')

                 # Retry from the source line, not from the previous output.
                 translated_subtitle = translator.translate(
                     subtitle, src=src, dest=dest).text
                 fail_to_translate = translated_subtitle.endswith('\n')
                 if not fail_to_translate:
                     if verbose:
                         print(f'Translate successfully. The result: {translated_subtitle}')
                 elif retries_left > 0:
                     # A negative budget is never decremented: unlimited.
                     retries_left -= 1
                     if not retries_left and verbose:
                         print(f'This subtitle failed to translate... [Position] entry {count_entries} line {i}')

             entry_failed = entry_failed or fail_to_translate
             # A failed line still carries its newline; a translated one
             # needs it appended.
             translated_subtitles.append(
                 translated_subtitle if fail_to_translate else translated_subtitle + '\n')

         # Count failures regardless of the verbosity level.
         if entry_failed:
             count_failure += 1
         if verbose:
             print('######################################################')
             print(f'Current number in sequence: {count_entries}')
             print(f'The tranlation result:')
             print(f"{''.join(translated_subtitles)}")
             print('######################################################')
         else:
             if entry_failed:
                 print(f'[{count_entries}] Failure to translate current entry...')
             else:
                 print(f'[{count_entries}] Current entry has been translated...')
             print(f'Total failures: {count_failure}/{count_entries}')
         yield number_in_sequence, timecode, translated_subtitles, count_failure, count_entries


 if __name__ == '__main__':
     # Command-line entry point: parse the arguments, translate the given
     # .srt file entry by entry and write the result next to it.

     # Local import: only this entry point needs os.path.
     import os

     # The synopsis is passed as ``usage``; given as the first positional
     # argument it would be taken for the program name instead.
     parser = argparse.ArgumentParser(
         usage='\n$ python translate_srt.py *.srt [src=]en [dest=]zh-cn [-n *.srt] [-p 5] [-v]\n')
     parser.add_argument('srt_file', type=str, help='requires a .srt file.')
     parser.add_argument('src', type=str,
                         help=f'the source language of your .srt file. available languages: {LANGUAGES}')
     parser.add_argument('dest', type=str,
                         help=f'the target language you would like to get. available languages: {LANGUAGES}')
     parser.add_argument('-n', '--rename', type=str,
                         help='rename the output file.')
     parser.add_argument('-p', '--patience', type=int,
                         help='the patience of retrying to translate. Expect a positive number.  If -1 is assigned, the program will try for infinite times until there is no failures happened in the output.')
     parser.add_argument('-v', '--verbose', action="store_true",
                         help='logs the translation process to console.')
     args = parser.parse_args()
     srt_file = args.srt_file
     entries = entries_generator(srt_file)

     # splitext() copes with any extension length (or none at all), unlike
     # chopping off a fixed four characters.
     translated_file = args.rename if args.rename else os.path.splitext(srt_file)[0] + '_translated.srt'
     # Defined up front so the summary also works for an input without entries.
     count_failure = count_entries = 0
     # Translated text may hold characters the locale encoding cannot
     # represent, so the output is always written as UTF-8.
     with open(translated_file, 'w', encoding='utf-8') as f:
         for number_in_sequence, timecode, subtitles, count_failure, count_entries in translate(entries, src=args.src, dest=args.dest, patience=args.patience, verbose=args.verbose):
             f.write(number_in_sequence)
             f.write(timecode)
             for subtitle in subtitles:
                 f.write(subtitle)
             # Blank line separating this entry from the next one.
             f.write('\n')
     print(f'Done. Please check: {translated_file}')
     print(f'Total failure to translate entries: {count_failure}/{count_entries}')
     # Comparing the counter avoids dividing by zero on an empty input.
     if count_failure > 0:
         print(
             'If you expect a lower failure ratio or completed translate, please check out the usage of [-p | --patience] argument.')
	"""Translate .srt files using Google Translate Ajax API.

	Usage
	-----
	$ python translate_srt.py *.srt [src=]en [dest=]zh-cn [-n *.srt] [-p 5] [-v]

	Note:
	- the values accepted by the positional arguments [src, dest] can be obtained from the *AVAILABLE LANGUAGES* section below.
	- try assigning a higher value to the *patience* argument if you meet a high failure ratio. [-p 5] is recommended. [-p -1] keeps retrying until no failure is left, which may take a long time to brute-force to completion.

	Essential notes from the API documentation
	-------------------------------
	- The maximum character limit on a single text is 15k.
	- Due to limitations of the web version of google translate, this API does not guarantee that the library would work properly at all times. (so please use this library if you don’t care about stability.)
	- If you want to use a stable API, I highly recommend you to use Google’s official translate API.
	- If you get HTTP 5xx error or errors like #6, it’s probably because Google has banned your client IP address.

	AVAILABLE LANGUAGES
	-------------------
	LANGUAGES = {
	'af': 'afrikaans',
	'sq': 'albanian',
	'am': 'amharic',
	'ar': 'arabic',
	'hy': 'armenian',
	'az': 'azerbaijani',
	'eu': 'basque',
	'be': 'belarusian',
	'bn': 'bengali',
	'bs': 'bosnian',
	'bg': 'bulgarian',
	'ca': 'catalan',
	'ceb': 'cebuano',
	'ny': 'chichewa',
	'zh-cn': 'chinese (simplified)',
	'zh-tw': 'chinese (traditional)',
	'co': 'corsican',
	'hr': 'croatian',
	'cs': 'czech',
	'da': 'danish',
	'nl': 'dutch',
	'en': 'english',
	'eo': 'esperanto',
	'et': 'estonian',
	'tl': 'filipino',
	'fi': 'finnish',
	'fr': 'french',
	'fy': 'frisian',
	'gl': 'galician',
	'ka': 'georgian',
	'de': 'german',
	'el': 'greek',
	'gu': 'gujarati',
	'ht': 'haitian creole',
	'ha': 'hausa',
	'haw': 'hawaiian',
	'iw': 'hebrew',
	'hi': 'hindi',
	'hmn': 'hmong',
	'hu': 'hungarian',
	'is': 'icelandic',
	'ig': 'igbo',
	'id': 'indonesian',
	'ga': 'irish',
	'it': 'italian',
	'ja': 'japanese',
	'jw': 'javanese',
	'kn': 'kannada',
	'kk': 'kazakh',
	'km': 'khmer',
	'ko': 'korean',
	'ku': 'kurdish (kurmanji)',
	'ky': 'kyrgyz',
	'lo': 'lao',
	'la': 'latin',
	'lv': 'latvian',
	'lt': 'lithuanian',
	'lb': 'luxembourgish',
	'mk': 'macedonian',
	'mg': 'malagasy',
	'ms': 'malay',
	'ml': 'malayalam',
	'mt': 'maltese',
	'mi': 'maori',
	'mr': 'marathi',
	'mn': 'mongolian',
	'my': 'myanmar (burmese)',
	'ne': 'nepali',
	'no': 'norwegian',
	'ps': 'pashto',
	'fa': 'persian',
	'pl': 'polish',
	'pt': 'portuguese',
	'pa': 'punjabi',
	'ro': 'romanian',
	'ru': 'russian',
	'sm': 'samoan',
	'gd': 'scots gaelic',
	'sr': 'serbian',
	'st': 'sesotho',
	'sn': 'shona',
	'sd': 'sindhi',
	'si': 'sinhala',
	'sk': 'slovak',
	'sl': 'slovenian',
	'so': 'somali',
	'es': 'spanish',
	'su': 'sundanese',
	'sw': 'swahili',
	'sv': 'swedish',
	'tg': 'tajik',
	'ta': 'tamil',
	'te': 'telugu',
	'th': 'thai',
	'tr': 'turkish',
	'uk': 'ukrainian',
	'ur': 'urdu',
	'uz': 'uzbek',
	'vi': 'vietnamese',
	'cy': 'welsh',
	'xh': 'xhosa',
	'yi': 'yiddish',
	'yo': 'yoruba',
	'zu': 'zulu',
	'fil': 'Filipino',
	'he': 'Hebrew'
	}
	"""
	import argparse
	from googletrans import Translator

	# Language codes offered for the ``src``/``dest`` arguments, mapped to the
	# English name of each language. In the code shown here the table is only
	# interpolated into the argparse help text; the arguments are not validated
	# against it.
	# Two languages are listed under two codes each: Filipino ('tl', 'fil') and
	# Hebrew ('iw', 'he').
	# NOTE(review): presumably mirrors the language list shipped with
	# googletrans -- confirm before relying on it.
	LANGUAGES = {
	'af': 'afrikaans',
	'sq': 'albanian',
	'am': 'amharic',
	'ar': 'arabic',
	'hy': 'armenian',
	'az': 'azerbaijani',
	'eu': 'basque',
	'be': 'belarusian',
	'bn': 'bengali',
	'bs': 'bosnian',
	'bg': 'bulgarian',
	'ca': 'catalan',
	'ceb': 'cebuano',
	'ny': 'chichewa',
	'zh-cn': 'chinese (simplified)',
	'zh-tw': 'chinese (traditional)',
	'co': 'corsican',
	'hr': 'croatian',
	'cs': 'czech',
	'da': 'danish',
	'nl': 'dutch',
	'en': 'english',
	'eo': 'esperanto',
	'et': 'estonian',
	'tl': 'filipino',
	'fi': 'finnish',
	'fr': 'french',
	'fy': 'frisian',
	'gl': 'galician',
	'ka': 'georgian',
	'de': 'german',
	'el': 'greek',
	'gu': 'gujarati',
	'ht': 'haitian creole',
	'ha': 'hausa',
	'haw': 'hawaiian',
	'iw': 'hebrew',
	'hi': 'hindi',
	'hmn': 'hmong',
	'hu': 'hungarian',
	'is': 'icelandic',
	'ig': 'igbo',
	'id': 'indonesian',
	'ga': 'irish',
	'it': 'italian',
	'ja': 'japanese',
	'jw': 'javanese',
	'kn': 'kannada',
	'kk': 'kazakh',
	'km': 'khmer',
	'ko': 'korean',
	'ku': 'kurdish (kurmanji)',
	'ky': 'kyrgyz',
	'lo': 'lao',
	'la': 'latin',
	'lv': 'latvian',
	'lt': 'lithuanian',
	'lb': 'luxembourgish',
	'mk': 'macedonian',
	'mg': 'malagasy',
	'ms': 'malay',
	'ml': 'malayalam',
	'mt': 'maltese',
	'mi': 'maori',
	'mr': 'marathi',
	'mn': 'mongolian',
	'my': 'myanmar (burmese)',
	'ne': 'nepali',
	'no': 'norwegian',
	'ps': 'pashto',
	'fa': 'persian',
	'pl': 'polish',
	'pt': 'portuguese',
	'pa': 'punjabi',
	'ro': 'romanian',
	'ru': 'russian',
	'sm': 'samoan',
	'gd': 'scots gaelic',
	'sr': 'serbian',
	'st': 'sesotho',
	'sn': 'shona',
	'sd': 'sindhi',
	'si': 'sinhala',
	'sk': 'slovak',
	'sl': 'slovenian',
	'so': 'somali',
	'es': 'spanish',
	'su': 'sundanese',
	'sw': 'swahili',
	'sv': 'swedish',
	'tg': 'tajik',
	'ta': 'tamil',
	'te': 'telugu',
	'th': 'thai',
	'tr': 'turkish',
	'uk': 'ukrainian',
	'ur': 'urdu',
	'uz': 'uzbek',
	'vi': 'vietnamese',
	'cy': 'welsh',
	'xh': 'xhosa',
	'yi': 'yiddish',
	'yo': 'yoruba',
	'zu': 'zulu',
	'fil': 'Filipino',
	'he': 'Hebrew'
	}


	def entries_generator(srt_file):
	    """Lazily parse a .srt file into subtitle entries.

	    Each entry is expected to consist of an index line, a timecode line,
	    any number of text lines and a terminating blank line. The blank line
	    after the last entry may be missing.

	    args:
	        srt_file: Path of the .srt file to read.

	    yields:
	        (number_in_sequence, timecode, subtitles): the first two are the raw
	        lines as read from the file; ``subtitles`` is the list of text
	        lines of the entry, each one ending with a newline.
	    """
	    with open(srt_file, 'r') as srt:
	        while True:
	            number_in_sequence = srt.readline()
	            # Tolerate surplus blank lines between entries instead of
	            # mistaking one of them for an index line.
	            while number_in_sequence == '\n':
	                number_in_sequence = srt.readline()
	            # readline() returns '' only at the end of the file.
	            if not number_in_sequence:
	                break
	            timecode = srt.readline()
	            subtitles = []
	            while True:
	                subtitle = srt.readline()
	                # A blank line closes the entry. '' means the file ended
	                # without one; not stopping here would loop forever.
	                if subtitle in ('', '\n'):
	                    break
	                # Only the very last line of a file can lack its newline;
	                # normalise it so every yielded text line looks the same.
	                if not subtitle.endswith('\n'):
	                    subtitle += '\n'
	                subtitles.append(subtitle)
	            yield number_in_sequence, timecode, subtitles


	def translate(entries, src, dest, patience, verbose, translator=None):
	    """Translate subtitle entries one by one, yielding each result.

	    A line is treated as failed when the text returned by the translator
	    still ends with a newline. NOTE(review): this heuristic is inherited
	    from the original code and presumably detects the source line being
	    echoed back untranslated -- confirm against googletrans.

	    args:
	        entries: Iterable of (number_in_sequence, timecode, subtitles).
	        src: The source language code.
	        dest: The target language code.
	        patience: Number of failed retries allowed for each subtitle line.
	            0 or None disables retrying; a negative value (e.g. -1) keeps
	            retrying until the line succeeds.
	        verbose: When true, print a detailed log of the translation.
	        translator: Optional object with a
	            ``translate(text, src=..., dest=...)`` method whose result has
	            a ``text`` attribute. A new ``Translator`` is created when
	            omitted.

	    yields:
	        (number_in_sequence, timecode, translated_subtitles, count_failure,
	        count_entries). The two counters are running totals: entries with
	        at least one failed line, and entries processed so far.
	    """
	    if translator is None:
	        translator = Translator()
	    count_failure = 0
	    count_entries = 0
	    for number_in_sequence, timecode, subtitles in entries:
	        count_entries += 1
	        translated_subtitles = []
	        # Reset for every entry so that an entry without text lines is
	        # handled and no result leaks over from the previous entry.
	        entry_failed = False
	        for i, subtitle in enumerate(subtitles, 1):
	            # Nothing to translate: pass blank lines through untouched.
	            if not subtitle.strip():
	                translated_subtitles.append(subtitle)
	                continue
	            translated_subtitle = translator.translate(
	                subtitle, src=src, dest=dest).text
	            # endswith() is safe even when the translation is empty.
	            fail_to_translate = translated_subtitle.endswith('\n')
	            # Every line gets the full retry budget; sharing one counter
	            # would leave later lines without any retry.
	            retries_left = patience
	            while fail_to_translate and retries_left:
	                if verbose:
	                    print('[Failure] Retry to translate...')
	                    print(f'The translated subtitle: {translated_subtitle}', end='')

	                # Retry from the source line, not from the previous output.
	                translated_subtitle = translator.translate(
	                    subtitle, src=src, dest=dest).text
	                fail_to_translate = translated_subtitle.endswith('\n')
	                if not fail_to_translate:
	                    if verbose:
	                        print(f'Translate successfully. The result: {translated_subtitle}')
	                elif retries_left > 0:
	                    # A negative budget is never decremented: unlimited.
	                    retries_left -= 1
	                    if not retries_left and verbose:
	                        print(f'This subtitle failed to translate... [Position] entry {count_entries} line {i}')

	            entry_failed = entry_failed or fail_to_translate
	            # A failed line still carries its newline; a translated one
	            # needs it appended.
	            translated_subtitles.append(
	                translated_subtitle if fail_to_translate else translated_subtitle + '\n')

	        # Count failures regardless of the verbosity level.
	        if entry_failed:
	            count_failure += 1
	        if verbose:
	            print('######################################################')
	            print(f'Current number in sequence: {count_entries}')
	            print(f'The tranlation result:')
	            print(f"{''.join(translated_subtitles)}")
	            print('######################################################')
	        else:
	            if entry_failed:
	                print(f'[{count_entries}] Failure to translate current entry...')
	            else:
	                print(f'[{count_entries}] Current entry has been translated...')
	            print(f'Total failures: {count_failure}/{count_entries}')
	        yield number_in_sequence, timecode, translated_subtitles, count_failure, count_entries


	if __name__ == '__main__':
	    # Command-line entry point: parse the arguments, translate the given
	    # .srt file entry by entry and write the result next to it.

	    # Local import: only this entry point needs os.path.
	    import os

	    # The synopsis is passed as ``usage``; given as the first positional
	    # argument it would be taken for the program name instead.
	    parser = argparse.ArgumentParser(
	        usage='\n$ python translate_srt.py *.srt [src=]en [dest=]zh-cn [-n *.srt] [-p 5] [-v]\n')
	    parser.add_argument('srt_file', type=str, help='requires a .srt file.')
	    parser.add_argument('src', type=str,
	                        help=f'the source language of your .srt file. available languages: {LANGUAGES}')
	    parser.add_argument('dest', type=str,
	                        help=f'the target language you would like to get. available languages: {LANGUAGES}')
	    parser.add_argument('-n', '--rename', type=str,
	                        help='rename the output file.')
	    parser.add_argument('-p', '--patience', type=int,
	                        help='the patience of retrying to translate. Expect a positive number. If -1 is assigned, the program will try for infinite times until there is no failures happened in the output.')
	    parser.add_argument('-v', '--verbose', action="store_true",
	                        help='logs the translation process to console.')
	    args = parser.parse_args()
	    srt_file = args.srt_file
	    entries = entries_generator(srt_file)

	    # splitext() copes with any extension length (or none at all), unlike
	    # chopping off a fixed four characters.
	    translated_file = args.rename if args.rename else os.path.splitext(srt_file)[0] + '_translated.srt'
	    # Defined up front so the summary also works for an input without entries.
	    count_failure = count_entries = 0
	    # Translated text may hold characters the locale encoding cannot
	    # represent, so the output is always written as UTF-8.
	    with open(translated_file, 'w', encoding='utf-8') as f:
	        for number_in_sequence, timecode, subtitles, count_failure, count_entries in translate(entries, src=args.src, dest=args.dest, patience=args.patience, verbose=args.verbose):
	            f.write(number_in_sequence)
	            f.write(timecode)
	            for subtitle in subtitles:
	                f.write(subtitle)
	            # Blank line separating this entry from the next one.
	            f.write('\n')
	    print(f'Done. Please check: {translated_file}')
	    print(f'Total failure to translate entries: {count_failure}/{count_entries}')
	    # Comparing the counter avoids dividing by zero on an empty input.
	    if count_failure > 0:
	        print(
	            'If you expect a lower failure ratio or completed translate, please check out the usage of [-p | --patience] argument.')